Commit · 6b6d1fd
Parent(s): a87ef04
add app

Files changed:
- app.py +1706 -0
- requirements.txt +357 -0

app.py ADDED
@@ -0,0 +1,1706 @@
# -*- coding: utf-8 -*-
"""Scimplify.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11L85VXrmvxrfXd6A9FGuJjI53nVtM0tN

# Scimplify

A NeuroAI paper simplifier. You paste a paragraph and get a plain-language
explanation back, with citations to the retrieved chunks the explanation
came from. The system refuses to answer if it can't ground the claims.

## 1. Setup
"""

import os, json, re, io, textwrap, time
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import openai
import chromadb
from chromadb.utils import embedding_functions
import gradio as gr
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader


# Fail fast if the OpenAI key is missing.
assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY env var"

client_oai = openai.OpenAI()

GENERATOR_MODEL = "gpt-4o-mini"

JUDGE_MODEL = "gpt-4o-mini"
GENERATOR_TEMPERATURE = 0.2
JUDGE_TEMPERATURE = 0.3
JUDGE_N_SAMPLES = 3
BOOTSTRAP_N = 2000
BOOTSTRAP_ALPHA = 0.05
_rng = np.random.default_rng(7)

RUN_EXPERIMENTS = False      # re-run experiments
LIVE_SEMANTIC_CHECK = True   # adds ~1s per query
JUDGE_PARALLELISM = 2        # rate limit cap

print(f"generator: {GENERATOR_MODEL}")
print(f"judge: {JUDGE_MODEL}")
print(f"experiments: {'WILL RE-RUN' if RUN_EXPERIMENTS else 'using cached results'}")
print(f"live semantic check: {'on' if LIVE_SEMANTIC_CHECK else 'off'}")

"""## 2. Data loading"""

REPO_RAW_BASE = "https://raw.githubusercontent.com/martazavro/scimplify_data/main"
LOCAL_DATA_DIR = Path("./data")

def _load_json(filename):
    url = f"{REPO_RAW_BASE}/{filename}"
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        print(f"loaded {filename} from repo")
        return r.json()
    except Exception as e:
        local = LOCAL_DATA_DIR / filename
        if local.exists():
            print(f"repo fetch failed ({e.__class__.__name__}), loaded {filename} from local")
            return json.loads(local.read_text())
        raise FileNotFoundError(
            f"Could not load {filename}. Set REPO_RAW_BASE correctly "
            f"or place the file in ./data/{filename}"
        )

neuroai_concepts = _load_json("concepts.json")
print(f"loaded {len(neuroai_concepts)} concepts")

def validate_validation_set(vs):
    items = vs["items"]
    ids = [x["id"] for x in items]
    assert len(set(ids)) == len(ids), "duplicate ids"
    required = {"id", "passage", "source", "key_terms", "category", "difficulty", "reference_explanation"}
    valid_cats = {"concepts_only", "recent_paper", "both", "neither"}
    for item in items:
        missing = required - set(item.keys())
        assert not missing, f"item {item.get('id')} missing {missing}"
        assert item["category"] in valid_cats
    cat_counts = Counter(x["category"] for x in items)
    print(f"validation set: {len(items)} items")
    print(f"  by category: {dict(cat_counts)}")

validation_set = _load_json("validation_set.json")
validate_validation_set(validation_set)

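# A minimal sketch of the item shape validate_validation_set() expects.
# The field values below are hypothetical, not taken from the real
# validation_set.json; only the key names come from the `required` set above.
_example_validation_item = {
    "id": "demo_001",
    "passage": "Predictive coding posits that the brain minimizes prediction error.",
    "source": "example.pdf",
    "key_terms": ["predictive coding"],
    "category": "concepts_only",
    "difficulty": "easy",
    "reference_explanation": "The brain keeps guessing what comes next and updates on mistakes.",
}
validate_validation_set({"items": [_example_validation_item]})
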
"""## 3. PDF extraction and chunking"""

def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text.strip()

def chunk_text(text, chunk_size=300, overlap=50):
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    chunks, current, current_len = [], [], 0
    for para in paragraphs:
        words = para.split()
        n = len(words)
        if n > chunk_size:
            if current:
                chunks.append(" ".join(current))
                tail = current[-overlap:] if len(current) > overlap else current
                current = list(tail); current_len = len(current)
            for i in range(0, n, chunk_size - overlap):
                chunk = words[i:i+chunk_size]
                if len(chunk) > 30:
                    chunks.append(" ".join(chunk))
            current = []; current_len = 0
        elif current_len + n > chunk_size:
            chunks.append(" ".join(current))
            tail = current[-overlap:] if len(current) > overlap else current
            current = list(tail) + words; current_len = len(current)
        else:
            current.extend(words); current_len += n
    if current and len(current) > 30:
        chunks.append(" ".join(current))
    return chunks

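# Quick sanity check of the chunker on synthetic text (not from the real
# corpus): a 700-word paragraph should split into overlapping ~300-word
# windows that advance by chunk_size - overlap = 250 words each step.
_demo_text = " ".join(f"word{i}" for i in range(700))
_demo_chunks = chunk_text(_demo_text, chunk_size=300, overlap=50)
print(f"demo: {len(_demo_chunks)} chunks, first chunk {len(_demo_chunks[0].split())} words")
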
"""## 4. Vector store setup"""

chroma_client = chromadb.Client()
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

def reset_concepts_collection():
    try:
        chroma_client.delete_collection("neuroai_concepts")
    except Exception:
        pass
    coll = chroma_client.create_collection(name="neuroai_concepts", embedding_function=ef)
    for entry in neuroai_concepts:
        doc = (
            f"Concept: {entry['concept']}\n"
            f"Definition: {entry['definition']}\n"
            f"Context: {entry['context']}\n"
            f"Typically found in: {entry['typical_usage']}"
        )
        coll.add(
            documents=[doc],
            ids=[entry["id"]],
            metadatas=[{"concept_name": entry["concept"], "concept_id": entry["id"]}]
        )
    return coll

def reset_papers_collection():
    try:
        chroma_client.delete_collection("neuroai_papers")
    except Exception:
        pass
    return chroma_client.create_collection(name="neuroai_papers", embedding_function=ef)

concepts_collection = reset_concepts_collection()
papers_collection = reset_papers_collection()
print(f"concepts: {concepts_collection.count()}, papers: {papers_collection.count()}")

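# Smoke-test the concept store with a toy query. The query string is
# illustrative, not from the validation set; Chroma returns nearest
# documents with distances included by default.
_probe = concepts_collection.query(query_texts=["how the brain predicts its inputs"], n_results=2)
for _doc, _dist in zip(_probe["documents"][0], _probe["distances"][0]):
    print(f"  dist={_dist:.3f}  {_doc.splitlines()[0]}")
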
"""## 5. Recent papers ingestion"""

PAPER_CHUNKS_URL = f"{REPO_RAW_BASE}/paper_chunks.json"

def load_paper_chunks():
    r = requests.get(PAPER_CHUNKS_URL, timeout=15)
    r.raise_for_status()
    return r.json()

def ingest_paper_chunks_from_json():
    chunks = load_paper_chunks()
    if not chunks:
        print("paper_chunks.json was empty")
        return 0

    documents = [c["text"] for c in chunks]
    ids = [c["chunk_id"] for c in chunks]
    metadatas = [{
        "source_name": c["source_name"],
        "source_type": c["source_type"],
        "arxiv_id": c["arxiv_id"],
        "title": c["title"],
        "chunk_idx": c["chunk_idx"],
        "chunk_id": c["chunk_id"],
    } for c in chunks]

    papers_collection.add(documents=documents, ids=ids, metadatas=metadatas)

    by_paper = {}
    for c in chunks:
        by_paper[c["arxiv_id"]] = by_paper.get(c["arxiv_id"], 0) + 1
    for aid, n in by_paper.items():
        print(f"  {aid}: {n} chunks")
    print(f"papers_collection now has {papers_collection.count()} total chunks")
    return len(chunks)

ingest_paper_chunks_from_json()

"""## 6. arXiv ingestion"""

import arxiv

def _existing_arxiv_ids():
    if papers_collection.count() == 0:
        return set()
    metas = papers_collection.get()["metadatas"]
    return {m.get("arxiv_id") for m in metas if m.get("arxiv_id")}


def ingest_from_arxiv(query="neuroAI OR (neural AND brain AND deep learning)",
                      max_results=10,
                      sort_by_recent=True,
                      verbose=True):
    """Search arXiv, download PDFs, chunk them, add to papers_collection.

    Returns dict with stats: {n_papers, n_chunks, n_skipped, errors}.
    Already-ingested papers (matched by arxiv_id) are skipped.
    """
    sort_by = arxiv.SortCriterion.SubmittedDate if sort_by_recent else arxiv.SortCriterion.Relevance
    arxiv_client = arxiv.Client(page_size=20, delay_seconds=3.0, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results, sort_by=sort_by)

    existing = _existing_arxiv_ids()
    download_dir = Path("./arxiv_papers")
    download_dir.mkdir(exist_ok=True)

    n_papers, n_chunks, n_skipped = 0, 0, 0
    errors = []

    for result in arxiv_client.results(search):
        # arxiv.org/abs/2509.23566v1 -> "2509.23566"
        full_id = result.entry_id.rsplit("/", 1)[-1]
        arxiv_id = full_id.split("v")[0]

        if arxiv_id in existing:
            n_skipped += 1
            if verbose:
                print(f"  skip {arxiv_id} (already ingested)")
            continue

        try:
            if verbose:
                print(f"  fetching {arxiv_id}: {result.title[:60]}...")
            pdf_path = result.download_pdf(dirpath=str(download_dir),
                                           filename=f"{arxiv_id}.pdf")
            text = extract_text_from_pdf(pdf_path)
            chunks = chunk_text(text)
            if not chunks:
                errors.append(f"{arxiv_id}: no chunks extracted")
                continue

            chunk_ids = [f"arxiv_{arxiv_id.replace('.', '_')}::c{i}" for i in range(len(chunks))]
            metadatas = [{
                "source_name": result.title,
                "source_type": "arxiv_paper",
                "arxiv_id": arxiv_id,
                "title": result.title,
                "chunk_idx": i,
                "chunk_id": chunk_ids[i],
            } for i in range(len(chunks))]

            papers_collection.add(documents=chunks, ids=chunk_ids, metadatas=metadatas)
            existing.add(arxiv_id)  # avoid double-add within the same call
            n_papers += 1
            n_chunks += len(chunks)
            if verbose:
                print(f"    -> added {len(chunks)} chunks")
        except Exception as e:
            errors.append(f"{arxiv_id}: {e.__class__.__name__}: {e}")
            if verbose:
                print(f"    ERROR: {e}")

    summary = {
        "n_papers": n_papers,
        "n_chunks": n_chunks,
        "n_skipped": n_skipped,
        "errors": errors,
        "total_in_kb": papers_collection.count(),
    }
    if verbose:
        print(f"\ningested {n_papers} papers ({n_chunks} chunks), skipped {n_skipped} duplicates")
        if errors:
            print(f"errors: {len(errors)}")
        print(f"total in knowledge base: {summary['total_in_kb']} chunks")
    return summary

ingest_from_arxiv(query="NeuroAI", max_results=2)

ingest_from_arxiv(query="NeuroAI", max_results=15)

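# Worked example of the ID normalization above, on a made-up entry URL:
# versioned arXiv IDs collapse to the bare ID used for dedup.
_entry = "http://arxiv.org/abs/2509.23566v1"
_full = _entry.rsplit("/", 1)[-1]   # "2509.23566v1"
print(_full.split("v")[0])          # "2509.23566"
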
"""## 7. Retrieval variants"""

def _flexible_last_word(word):
    if len(word) < 4:
        return re.escape(word)
    stem = re.escape(word[:-2])
    return stem + r"[a-z]{0,4}"

def build_concept_patterns(concept_entry):
    name = concept_entry["concept"]
    abbrev_match = re.search(r"\(([^)]+)\)", name)
    abbrev = abbrev_match.group(1) if abbrev_match else None
    base = re.sub(r"\s*\([^)]+\)", "", name).strip()

    patterns = []
    words = base.split()
    if len(words) == 1:
        long_re = r"\b" + _flexible_last_word(words[0]) + r"\b"
    else:
        parts = [re.escape(w) for w in words[:-1]] + [_flexible_last_word(words[-1])]
        long_re = r"\b" + r"\s+".join(parts) + r"\b"
    patterns.append(re.compile(long_re, re.IGNORECASE))

    if abbrev:
        patterns.append(re.compile(r"\b" + re.escape(abbrev) + r"s?\b"))
    return patterns

CONCEPT_PATTERNS = [(entry, build_concept_patterns(entry)) for entry in neuroai_concepts]


def _concept_doc_text(entry):
    return (
        f"Concept: {entry['concept']}\n"
        f"Definition: {entry['definition']}\n"
        f"Context: {entry['context']}\n"
        f"Typically found in: {entry['typical_usage']}"
    )


def regex_retrieve(passage):
    hits = []
    for entry, patterns in CONCEPT_PATTERNS:
        if any(p.search(passage) for p in patterns):
            hits.append({
                "type": "regex_concept",
                "concept_name": entry["concept"],
                "concept_id": entry["id"],
                "chunk_id": entry["id"],
                "content": _concept_doc_text(entry),
                "distance": 0.0,
                "source_method": "regex",
            })
    return hits


def retrieve_concepts_embedding(passage, n_results=3):
    results = concepts_collection.query(query_texts=[passage], n_results=n_results)
    out = []
    for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
        out.append({
            "type": "concept",
            "concept_name": meta["concept_name"],
            "concept_id": meta.get("concept_id"),
            "chunk_id": meta.get("concept_id"),
            "content": doc,
            "distance": round(dist, 3),
            "source_method": "embedding",
        })
    return out


def retrieve_paper_chunks(passage, n_results=3):
    if papers_collection.count() == 0:
        return []
    results = papers_collection.query(
        query_texts=[passage],
        n_results=min(n_results, papers_collection.count())
    )
    out = []
    for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
        out.append({
            "type": "paper_chunk",
            "source_name": meta["source_name"],
            "source_type": meta["source_type"],
            "chunk_id": meta.get("chunk_id"),
            "content": doc,
            "distance": round(dist, 3),
            "source_method": "embedding",
        })
    return out


def hybrid_retrieve_concepts(passage, n_embedding=3, max_total=6):
    # Regex hits are exact term matches, so they come first.
    rgx = regex_retrieve(passage)
    seen_names = {h["concept_name"] for h in rgx}
    out = list(rgx)

    # Top up with embedding hits; overlaps are re-marked as found by "both" tiers.
    if len(out) < max_total:
        emb = retrieve_concepts_embedding(passage, n_results=n_embedding)
        for hit in emb:
            if hit["concept_name"] not in seen_names:
                out.append(hit)
                seen_names.add(hit["concept_name"])
            else:
                for r in out:
                    if r["concept_name"] == hit["concept_name"]:
                        r["source_method"] = "both"
                        break
            if len(out) >= max_total:
                break
    return out


def retrieve_for_variant(passage, variant, n_concepts=3, n_papers=3):
    if variant == "no_rag":
        return [], []
    elif variant == "embedding_only":
        return (retrieve_concepts_embedding(passage, n_results=n_concepts),
                retrieve_paper_chunks(passage, n_results=n_papers))
    elif variant == "regex_only":
        return (regex_retrieve(passage),
                retrieve_paper_chunks(passage, n_results=n_papers))
    elif variant == "hybrid":
        return (hybrid_retrieve_concepts(passage, n_embedding=n_concepts),
                retrieve_paper_chunks(passage, n_results=n_papers))
    else:
        raise ValueError(f"unknown variant: {variant}")

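# Illustration of the flexible-suffix matcher: the last word's final two
# letters are relaxed so the pattern also catches plural/derived forms.
# The concept name here is made up for the demo, not from concepts.json.
_pats = build_concept_patterns({"concept": "Neural Representation (NR)"})
for _s in ["neural representations emerge", "the NR hypothesis", "unrelated text"]:
    print(_s, "->", any(p.search(_s) for p in _pats))
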
"""## 8. Citation-enforced generation with semantic guard"""

CITED_SYSTEM_PROMPT = """You are a scientific reading assistant that helps people understand passages from NeuroAI research papers.

You have access to retrieved context. Each source has a stable ID in square brackets like [c004] (for a concept definition) or [arxiv_2511_12345::c3] (for a paper chunk).

Your job:
1. Read the passage.
2. Rewrite it in plain language an undergraduate could follow.
3. For EVERY factual sentence in your explanation, append one or more citations in square brackets, drawn ONLY from the IDs of the retrieved sources shown to you.
4. Do not invent citation IDs. Do not cite sources you were not shown.
5. If the retrieved context does not contain enough information to answer faithfully, output EXACTLY this string and nothing else:
I don't have enough evidence in the retrieved context.

Format:
**Key terms:** short definitions of technical terms, each with its citation
**Plain-language version:** the passage rewritten clearly, with citations on every factual sentence
**What this means in context:** 1-2 sentences on why this matters, with citations
"""

ABSTAIN_MESSAGE = "I don't have enough evidence in the retrieved context."
CITATION_PATTERN = re.compile(r"\[([a-zA-Z0-9_\-:]+)\]")
SEMANTIC_FAIL_THRESHOLD = 0.5


def _format_context_block(concept_results, paper_results):
    lines = []
    if concept_results:
        lines.append("CONCEPT DEFINITIONS:")
        for r in concept_results:
            cid = r.get("chunk_id") or r.get("concept_id")
            lines.append(f"\n[{cid}] {r['content']}")
        lines.append("---")
    if paper_results:
        lines.append("\nPAPER/ARTICLE CONTEXT:")
        for r in paper_results:
            cid = r.get("chunk_id")
            lines.append(f"\n[{cid}] (from {r['source_name']}): {r['content']}")
        lines.append("---")
    if not concept_results and not paper_results:
        lines.append("(no context retrieved)")
    return "\n".join(lines)


def _collect_allowed_ids(concept_results, paper_results):
    ids = set()
    for r in concept_results + paper_results:
        cid = r.get("chunk_id") or r.get("concept_id")
        if cid:
            ids.add(cid)
    return ids


def _build_chunk_lookup(concept_results, paper_results):
    """Map citation_id -> chunk content. Used by the semantic check."""
    lookup = {}
    for r in concept_results + paper_results:
        cid = r.get("chunk_id") or r.get("concept_id")
        if cid:
            lookup[cid] = r["content"]
    return lookup


def generate_cited_explanation(passage, concept_results, paper_results, model=None):
    model = model or GENERATOR_MODEL
    context_block = _format_context_block(concept_results, paper_results)
    user_msg = f"{context_block}\n\nPASSAGE TO EXPLAIN:\n{passage}"
    resp = client_oai.chat.completions.create(
        model=model,
        temperature=GENERATOR_TEMPERATURE,
        messages=[
            {"role": "system", "content": CITED_SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ],
    )
    return resp.choices[0].message.content


def validate_citations(answer: str, allowed_ids: set) -> Tuple[bool, List[str]]:
    """Lexical guard: every citation ID in the answer must be in the allowed set."""
    if ABSTAIN_MESSAGE in answer:
        return True, []
    cited = CITATION_PATTERN.findall(answer)
    issues = []
    if not cited:
        issues.append("No citations found in non-abstain answer")
    for cid in cited:
        if cid not in allowed_ids:
            issues.append(f"Invalid citation: {cid}")
    return len(issues) == 0, issues


def _split_into_sentences(text):
    """Cheap sentence splitter that keeps citation brackets attached."""
    # split on . ! ? followed by space and a capital, keeping the punctuation
    parts = re.split(r"(?<=[.!?])\s+(?=[A-Z*])", text.strip())
    return [p.strip() for p in parts if p.strip()]


def _strip_citations(sentence):
    return CITATION_PATTERN.sub("", sentence).strip()


def check_sentence_supported(sentence_text, cited_chunks):
    # verify_claim_against_evidence is defined in section 10; it is only
    # called at query time, after the whole file has executed.
    claim = _strip_citations(sentence_text)
    if len(claim) < 10 or not cited_chunks:
        return {"label": "skipped", "reason": "no claim or no chunks"}
    evidence = "\n\n".join(f"[{cid}]: {text}" for cid, text in cited_chunks)
    return verify_claim_against_evidence(claim, [evidence])


def semantic_per_sentence_check(answer, chunk_lookup):
    if ABSTAIN_MESSAGE in answer:
        return []
    sentences = _split_into_sentences(answer)
    findings = []
    for sent in sentences:
        cited_ids = CITATION_PATTERN.findall(sent)
        if not cited_ids:
            continue
        cited_chunks = [(cid, chunk_lookup[cid]) for cid in cited_ids if cid in chunk_lookup]
        if not cited_chunks:
            continue
        result = check_sentence_supported(sent, cited_chunks)
        findings.append({
            "sentence": sent,
            "citations": cited_ids,
            "label": result["label"],
            "reason": result["reason"],
        })
    return findings


def annotate_unsupported_sentences(answer, findings):
    """Mark unsupported sentences in the rendered output."""
    for f in findings:
        if f["label"] in ("contradicted", "insufficient"):
            marker = "⚠️ "
            if marker not in f["sentence"]:
                answer = answer.replace(f["sentence"], marker + f["sentence"], 1)
    return answer


def generate_with_citation_guard(passage, concept_results, paper_results, model=None,
                                 allow_no_context_bypass=False,
                                 do_semantic_check=None):
    do_semantic_check = (do_semantic_check if do_semantic_check is not None
                         else LIVE_SEMANTIC_CHECK)

    if allow_no_context_bypass and not concept_results and not paper_results:
        resp = client_oai.chat.completions.create(
            model=model or GENERATOR_MODEL,
            temperature=GENERATOR_TEMPERATURE,
            messages=[
                {"role": "system", "content": "You are a scientific reading assistant. Explain the given passage in plain language that an undergraduate could follow. Be concise."},
                {"role": "user", "content": f"PASSAGE:\n{passage}"},
            ],
        )
        return {
            "answer": resp.choices[0].message.content,
            "valid_citations": None,
            "guard_triggered": False,
            "issues": [],
            "abstained": False,
            "semantic_findings": [],
            "semantic_fail_rate": np.nan,
        }

    raw = generate_cited_explanation(passage, concept_results, paper_results, model=model)
    allowed_ids = _collect_allowed_ids(concept_results, paper_results)
    ok, issues = validate_citations(raw, allowed_ids)

    # lexical guard
    if not ok:
        return {
            "answer": ABSTAIN_MESSAGE,
            "valid_citations": False,
            "guard_triggered": True,
            "issues": issues,
            "abstained": True,
            "raw_rejected": raw,
            "semantic_findings": [],
            "semantic_fail_rate": np.nan,
        }

    # semantic per-sentence check
    findings = []
    semantic_fail_rate = np.nan
    if do_semantic_check and ABSTAIN_MESSAGE not in raw:
        chunk_lookup = _build_chunk_lookup(concept_results, paper_results)
        findings = semantic_per_sentence_check(raw, chunk_lookup)
        if findings:
            n_failed = sum(1 for f in findings if f["label"] in ("contradicted", "insufficient"))
            semantic_fail_rate = n_failed / len(findings)

            if semantic_fail_rate > SEMANTIC_FAIL_THRESHOLD:
                return {
                    "answer": ABSTAIN_MESSAGE,
                    "valid_citations": True,
                    "guard_triggered": True,
                    "issues": [f"semantic check failed: {n_failed}/{len(findings)} sentences unsupported"],
                    "abstained": True,
                    "raw_rejected": raw,
                    "semantic_findings": findings,
                    "semantic_fail_rate": semantic_fail_rate,
                }

            raw = annotate_unsupported_sentences(raw, findings)

    return {
        "answer": raw,
        "valid_citations": True,
        "guard_triggered": False,
        "issues": [],
        "abstained": ABSTAIN_MESSAGE in raw,
        "semantic_findings": findings,
        "semantic_fail_rate": semantic_fail_rate,
    }

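# Toy check of the lexical guard (IDs here are made up): a fabricated
# citation should be flagged, a known one should pass.
_ok, _issues = validate_citations("The brain predicts. [c001]", {"c001"})
print(_ok, _issues)   # True []
_ok, _issues = validate_citations("The brain predicts. [c999]", {"c001"})
print(_ok, _issues)   # False ['Invalid citation: c999']
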
"""## 9. LLM-as-judge metrics"""

def _coerce_score(x):
    try:
        v = int(float(x))
    except Exception:
        v = 0
    return max(0, min(2, v))


def _single_judge_call(system_prompt, user_prompt):
    try:
        resp = client_oai.chat.completions.create(
            model=JUDGE_MODEL,
            temperature=JUDGE_TEMPERATURE,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        data = json.loads(resp.choices[0].message.content)
        return {
            "score": _coerce_score(data.get("score", 0)),
            "reason": str(data.get("reason", "")).strip(),
        }
    except Exception as e:
        return {"score": None, "reason": f"ERROR: {e}"}


def _judge_call_parallel(system_prompt, user_prompt, n=None):
    """Run n judge calls in parallel via ThreadPoolExecutor."""
    n = n or JUDGE_N_SAMPLES
    results = [None] * n
    with ThreadPoolExecutor(max_workers=min(n, JUDGE_PARALLELISM)) as ex:
        futures = {ex.submit(_single_judge_call, system_prompt, user_prompt): i
                   for i in range(n)}
        for fut in as_completed(futures):
            i = futures[fut]
            results[i] = fut.result()
    return results


def _aggregate(runs):
    valid = [r for r in runs if r["score"] is not None]
    if not valid:
        return {"score": None, "reasons": [r["reason"] for r in runs], "n_valid": 0}
    return {
        "score": sum(r["score"] for r in valid) / len(valid),
        "reasons": [r["reason"] for r in valid],
        "n_valid": len(valid),
    }


CORRECTNESS_SYSTEM = """You are evaluating answer correctness for a question about a NeuroAI paper passage.

Given a passage, a reference explanation (gold-standard), and a system explanation, score the system explanation's correctness using ONLY the information in the passage and reference.

Return ONLY a JSON object:
{"score": <int 0/1/2>, "reason": "<one sentence>"}

Scoring scale:
- 0 = wrong (contradicts the passage or says something incorrect)
- 1 = partly correct (captures some but not all of the main idea, or adds unsupported claims)
- 2 = correct (faithful to what the passage actually says)
"""

def score_correctness(passage, reference, candidate):
    user = f"PASSAGE:\n{passage}\n\nREFERENCE:\n{reference}\n\nSYSTEM EXPLANATION:\n{candidate}"
    runs = _judge_call_parallel(CORRECTNESS_SYSTEM, user)
    return _aggregate(runs)


EVIDENCE_SYSTEM = """You are evaluating whether a system explanation's key claims are supported by retrieved context.

Given a passage, the retrieved context that was shown to the system, and the system's explanation, score whether the explanation's factual claims are well-supported by the retrieved context.

Return ONLY a JSON object:
{"score": <int 0/1/2>, "reason": "<one sentence>"}

Scoring scale:
- 0 = unsupported (most claims cannot be found in retrieved context)
- 1 = partly supported (some claims supported, others require outside knowledge)
- 2 = well supported (claims are traceable to retrieved context)

If the retrieved context is empty (no RAG baseline), score 0.
"""

def score_evidence_support(passage, retrieved_context, candidate):
    user = f"PASSAGE:\n{passage}\n\nRETRIEVED CONTEXT:\n{retrieved_context}\n\nSYSTEM EXPLANATION:\n{candidate}"
    runs = _judge_call_parallel(EVIDENCE_SYSTEM, user)
    return _aggregate(runs)


CITATION_SYSTEM = """You are evaluating whether citations in a system explanation are faithful.

The system was asked to cite each factual sentence with an ID from the retrieved context (like [c004] or [arxiv_2511_12345::c3]). Given the retrieved context and the system explanation with citations, score whether the citations are relevant and the cited material actually supports the adjacent claim.

Return ONLY a JSON object:
{"score": <int 0/1/2>, "reason": "<one sentence>"}

Scoring scale:
- 0 = unfaithful (citations invented, missing, or do not support adjacent claims)
- 1 = mixed (some citations support their claims, others do not)
- 2 = faithful (citations are present, relevant, and support adjacent claims)

If the answer is the abstention message ("I don't have enough evidence..."), score 2 (correctly declined).
"""

def score_citation_faithfulness(retrieved_context, candidate):
    user = f"RETRIEVED CONTEXT:\n{retrieved_context}\n\nSYSTEM EXPLANATION:\n{candidate}"
    runs = _judge_call_parallel(CITATION_SYSTEM, user)
    return _aggregate(runs)


def score_all_metrics(passage, reference, retrieved_context, candidate):
    """Run all three metrics in parallel."""
    with ThreadPoolExecutor(max_workers=3) as ex:
        f_c = ex.submit(score_correctness, passage, reference, candidate)
        f_e = ex.submit(score_evidence_support, passage, retrieved_context, candidate)
        f_f = ex.submit(score_citation_faithfulness, retrieved_context, candidate)
        return {
            "correctness": f_c.result(),
            "evidence_support": f_e.result(),
            "citation_faithfulness": f_f.result(),
        }

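# Offline sanity check of the aggregation logic (no API calls): scores are
# clipped to 0..2 and averaged over the judge samples that parsed.
print(_coerce_score("2.7"), _coerce_score(-1), _coerce_score("oops"))  # 2 0 0
print(_aggregate([{"score": 2, "reason": "a"},
                  {"score": 1, "reason": "b"},
                  {"score": None, "reason": "ERROR"}])["score"])       # 1.5
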
"""## 10. Claim-based faithfulness"""

CLAIM_EXTRACTION_SYSTEM = """Extract atomic factual claims from the given answer.

Return ONLY a JSON object:
{"claims": ["claim 1", "claim 2", ...]}

Rules:
- Each claim should be a single, minimal factual assertion
- Ignore pure formatting, headers, or meta-commentary
- Skip citation markers like [c004] when extracting claims
- If there are no factual claims, return {"claims": []}
"""

EVIDENCE_EXTRACTION_SYSTEM = """Extract factual assertions from the given text chunk.

Return ONLY a JSON object:
{"assertions": ["assertion 1", "assertion 2", ...]}

Rules:
- One atomic factual assertion per entry
- Skip anything that is a question, opinion, or example
- If there are no assertions, return {"assertions": []}
"""

CLAIM_VERIFICATION_SYSTEM = """Classify if a claim is supported, contradicted, or insufficient given evidence.

Return ONLY a JSON object:
{"label": "supported" | "contradicted" | "insufficient", "reason": "<one short sentence>"}

Definitions:
- supported: the evidence directly supports the claim
- contradicted: the evidence contradicts the claim
- insufficient: the evidence is silent or unclear on the claim
"""

def _json_call(system_prompt, user_prompt, model=None):
    model = model or JUDGE_MODEL
    resp = client_oai.chat.completions.create(
        model=model,
        temperature=JUDGE_TEMPERATURE,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    try:
        return json.loads(resp.choices[0].message.content)
    except Exception:
        return {}


def extract_claims(answer):
    data = _json_call(CLAIM_EXTRACTION_SYSTEM, f"ANSWER:\n{answer}")
    return [c for c in data.get("claims", []) if c and isinstance(c, str)]


_ASSERTION_CACHE = {}

def extract_assertions_from_chunk(chunk):
    key = hash(chunk)
    if key in _ASSERTION_CACHE:
        return _ASSERTION_CACHE[key]
    data = _json_call(EVIDENCE_EXTRACTION_SYSTEM, f"CHUNK:\n{chunk}")
    out = [a for a in data.get("assertions", []) if a and isinstance(a, str)]
    _ASSERTION_CACHE[key] = out
    return out


def _normalize_label(label):
    x = (label or "").strip().lower()
    if "support" in x: return "supported"
    if "contrad" in x: return "contradicted"
    return "insufficient"


def verify_claim_against_evidence(claim, assertions):
    evidence_blob = "\n".join(assertions) if assertions else "NO_EVIDENCE"
    data = _json_call(
        CLAIM_VERIFICATION_SYSTEM,
        f"CLAIM:\n{claim}\n\nEVIDENCE:\n{evidence_blob}"
    )
    return {
        "label": _normalize_label(data.get("label")),
        "reason": str(data.get("reason", "")).strip(),
    }


def claim_based_faithfulness(answer, retrieved_chunks):
    if ABSTAIN_MESSAGE in answer:
        return {
            "n_claims": 0,
            "support_rate": np.nan,
            "contradiction_rate": np.nan,
            "unsupported_rate": np.nan,
            "abstained": True,
            "details": [],
        }

    claims = extract_claims(answer)
    if not claims:
        return {
            "n_claims": 0,
            "support_rate": np.nan,
            "contradiction_rate": np.nan,
            "unsupported_rate": np.nan,
            "abstained": False,
            "details": [],
        }

    with ThreadPoolExecutor(max_workers=JUDGE_PARALLELISM) as ex:
        all_assertions_lists = list(ex.map(extract_assertions_from_chunk, retrieved_chunks))
    all_assertions = [a for sub in all_assertions_lists for a in sub]

    with ThreadPoolExecutor(max_workers=JUDGE_PARALLELISM) as ex:
        verify_results = list(ex.map(
            lambda c: verify_claim_against_evidence(c, all_assertions),
            claims
        ))
    labels = [r["label"] for r in verify_results]
    details = [{"claim": c, **r} for c, r in zip(claims, verify_results)]

    n = len(labels)
    return {
        "n_claims": n,
        "support_rate": sum(1 for l in labels if l == "supported") / n,
        "contradiction_rate": sum(1 for l in labels if l == "contradicted") / n,
        "unsupported_rate": sum(1 for l in labels if l == "insufficient") / n,
        "abstained": False,
        "details": details,
    }

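# Offline check of the label normalizer (no API): messy judge outputs fold
# into the three canonical labels.
for _raw in ["Supported", "CONTRADICTED by evidence", "unclear", None]:
    print(repr(_raw), "->", _normalize_label(_raw))
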
"""## 11. Retrieval precision@k / recall@k and bootstrap CIs"""

def precision_recall_at_k(retrieved_chunks, gold_facts, k=3):
    if not gold_facts:
        return np.nan, np.nan
    top_k = retrieved_chunks[:k]
    if not top_k:
        return 0.0, 0.0
    rel_flags = []
    for chunk in top_k:
        c = chunk.lower()
        is_rel = any(fact.lower() in c for fact in gold_facts)
        rel_flags.append(is_rel)
    precision = float(np.mean(rel_flags))
    covered = 0
    for fact in gold_facts:
        if any(fact.lower() in chunk.lower() for chunk in top_k):
            covered += 1
    recall = covered / len(gold_facts)
    return precision, recall


def bootstrap_ci(values, n_boot=None, alpha=None):
    n_boot = n_boot or BOOTSTRAP_N
    alpha = alpha or BOOTSTRAP_ALPHA
    values = np.array(values, dtype=float)
    values = values[~np.isnan(values)]
    if len(values) == 0:
        return np.nan, np.nan, np.nan
    boots = np.empty(n_boot)
    n = len(values)
    for i in range(n_boot):
        sample = _rng.choice(values, size=n, replace=True)
        boots[i] = sample.mean()
    lo = np.percentile(boots, 100 * (alpha / 2))
    hi = np.percentile(boots, 100 * (1 - alpha / 2))
    return float(values.mean()), float(lo), float(hi)


def format_ci(values, digits=3):
    m, lo, hi = bootstrap_ci(values)
    return f"{m:.{digits}f} [{lo:.{digits}f}, {hi:.{digits}f}]"

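# Worked example on synthetic values (not experiment data): a substring hit
# counts a chunk as relevant, and format_ci reports mean [lo, hi].
_p, _r = precision_recall_at_k(
    ["predictive coding in cortex", "unrelated chunk", "hebbian learning"],
    gold_facts=["predictive coding", "hebbian"], k=3)
print(f"precision@3={_p:.2f} recall@3={_r:.2f}")   # 0.67 and 1.00
print(format_ci([0.4, 0.5, 0.6, 0.5, 0.45]))
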
"""## 12. Logging"""

EVAL_LOG_DIR = Path("./eval_logs")
EVAL_LOG_DIR.mkdir(exist_ok=True)

def log_eval_row(experiment_id, passage_id, variant, retrieved_sources,
                 generation_result, judge_scores, extra=None):
    row = {
        "experiment_id": experiment_id,
        "passage_id": passage_id,
        "variant": variant,
        "model": GENERATOR_MODEL,
        "n_retrieved": len(retrieved_sources),
        "retrieved_chunk_ids": ";".join(
            str(r.get("chunk_id") or r.get("concept_id") or "?") for r in retrieved_sources
        ),
        "guard_triggered": int(generation_result.get("guard_triggered", False)),
        "abstained": int(generation_result.get("abstained", False)),
        "answer_chars": len(generation_result.get("answer", "")),
        "generated_text": generation_result.get("answer", ""),
        "correctness": judge_scores.get("correctness", {}).get("score"),
        "evidence_support": judge_scores.get("evidence_support", {}).get("score"),
        "citation_faithfulness": judge_scores.get("citation_faithfulness", {}).get("score"),
        "semantic_fail_rate": generation_result.get("semantic_fail_rate", np.nan),
    }
    if extra:
        row.update(extra)
    path = EVAL_LOG_DIR / f"{experiment_id}.csv"
    pd.DataFrame([row]).to_csv(
        path, mode="a", header=not path.exists(), index=False
    )
    return row


def load_or_run_experiment(experiment_id, runner_fn):
    local_path = EVAL_LOG_DIR / f"{experiment_id}.csv"

    if RUN_EXPERIMENTS:
        if local_path.exists():
            local_path.unlink()
        print(f"running {experiment_id} from scratch...")
        return runner_fn()

    url = f"{REPO_RAW_BASE}/eval_logs/{experiment_id}.csv"
    try:
        df = pd.read_csv(url)
        print(f"loaded {experiment_id} from repo cache: {len(df)} rows")
        df.to_csv(local_path, index=False)
        return df
    except Exception:
        pass

    if local_path.exists():
        df = pd.read_csv(local_path)
        print(f"loaded {experiment_id} from local cache: {len(df)} rows")
        return df

    print(f"⚠ no cached results for {experiment_id}. Set RUN_EXPERIMENTS=True to generate.")
    return None

"""## 13. Judge calibration"""

RUN_CALIBRATION = False  # set True to score a few items by hand against the judge

def calibrate_judge(n_items=5):
    items = [x for x in validation_set["items"]]
    sample = items[:n_items]
    diffs = {"correctness": [], "evidence_support": [], "citation_faithfulness": []}

    for item in sample:
        c, p = retrieve_for_variant(item["passage"], "hybrid")
        result = generate_with_citation_guard(item["passage"], c, p, do_semantic_check=False)
        explanation = result["answer"]
        context_text = _format_context_block(c, p)

        print("=" * 70)
        print(f"ITEM {item['id']}")
        print(f"PASSAGE: {item['passage'][:300]}")
        print(f"\nREFERENCE: {item['reference_explanation']}")
        print(f"\nSYSTEM EXPLANATION:\n{explanation}")
        print("\nScore each metric 0/1/2 (0=bad, 1=partial, 2=good):")
        try:
            human = {
                "correctness": int(input("  correctness: ")),
                "evidence_support": int(input("  evidence_support: ")),
                "citation_faithfulness": int(input("  citation_faithfulness: ")),
            }
        except (ValueError, EOFError):
            print("aborted")
            return None

        all_scores = score_all_metrics(
            item["passage"], item["reference_explanation"], context_text, explanation
        )
        scores_clean = {k: all_scores[k]["score"] for k in all_scores}
        for k in diffs:
            if scores_clean[k] is not None:
                diffs[k].append(abs(human[k] - scores_clean[k]))

    print("\n=== CALIBRATION RESULTS ===")
    for k, vals in diffs.items():
        if vals:
            mad = sum(vals) / len(vals)
            flag = " ⚠ DISAGREES" if mad > 0.5 else " ok"
            print(f"  {k}: mean abs diff = {mad:.2f}{flag}")
    return diffs


if RUN_CALIBRATION:
    calibrate_judge(n_items=5)
else:
    print("calibration skipped (RUN_CALIBRATION=False)")
    print("Last calibration: correctness MAD=0.60 (DISAGREES), evidence MAD=0.40, citation MAD=0.20")

| 1096 |
+
"""## 14. Experiment A — retrieval ablation
|
| 1097 |
+
|
| 1098 |
+
**Question.** Does RAG help, and does the regex tier earn its place?
|
| 1099 |
+
|
| 1100 |
+
**Hypothesis.** All RAG variants will beat the no-RAG baseline on claim_support_rate and evidence_support. Hybrid will beat either single-tier variant.
|
| 1101 |
+
|
| 1102 |
+
**Variable changed.** Retrieval method ∈ {no_rag, embedding_only, regex_only, hybrid}. Everything else held constant.
|
| 1103 |
+
"""
|
| 1104 |
+
|
def run_experiment_A():
    items = list(validation_set["items"])
    variants = ["no_rag", "embedding_only", "regex_only", "hybrid"]
    total_runs = len(items) * len(variants)
    print(f"running experiment A: {len(items)} items × {len(variants)} variants = {total_runs} runs")

    for i, item in enumerate(items):
        for variant in variants:
            try:
                c, p = retrieve_for_variant(item["passage"], variant)
                retrieved = c + p
                context_text = _format_context_block(c, p)

                result = generate_with_citation_guard(
                    item["passage"], c, p,
                    allow_no_context_bypass=(variant == "no_rag"),
                    do_semantic_check=False,
                )

                scores = score_all_metrics(
                    item["passage"], item["reference_explanation"],
                    context_text, result["answer"],
                )
                cb = claim_based_faithfulness(
                    result["answer"], [r["content"] for r in retrieved],
                )
                rp, rr = precision_recall_at_k(
                    [r["content"] for r in retrieved], item["key_terms"], k=3,
                )

                log_eval_row(
                    "experiment_A", item["id"], variant,
                    retrieved, result, scores,
                    extra={
                        "category": item["category"],
                        "claim_support_rate": cb["support_rate"],
                        "claim_contradiction_rate": cb["contradiction_rate"],
                        "claim_unsupported_rate": cb["unsupported_rate"],
                        "n_claims": cb["n_claims"],
                        "retrieval_precision_at_3": rp,
                        "retrieval_recall_at_3": rr,
                    },
                )
            except Exception as e:
                print(f" ERROR {item['id']}/{variant}: {e}")
        print(f" done {item['id']} ({i+1}/{len(items)})")

    return pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")


experiment_A_df = load_or_run_experiment("experiment_A", run_experiment_A)

def analyze_experiment_A():
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
    metric_cols = ["correctness", "evidence_support", "citation_faithfulness",
                   "claim_support_rate", "retrieval_recall_at_3", "abstained"]

    print("=" * 70)
    print("OVERALL means with 95% bootstrap CIs")
    print("=" * 70)
    for variant in ["no_rag", "embedding_only", "regex_only", "hybrid"]:
        sub = df[df.variant == variant]
        print(f"\n{variant}")
        for m in metric_cols:
            if m in sub.columns:
                print(f" {m:28s} {format_ci(sub[m].values)}")

    print("\n" + "=" * 70)
    print("HEADLINE METRIC: claim_support_rate (correctness saturates — see report)")
    print("=" * 70)
    for variant in ["no_rag", "embedding_only", "regex_only", "hybrid"]:
        sub = df[df.variant == variant]
        if "claim_support_rate" in sub.columns:
            print(f" {variant:18s} {format_ci(sub['claim_support_rate'].values)}")

    return df

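# Minimal sketch of the percentile bootstrap behind bootstrap_ci / format_ci (both
# defined earlier); the values and seed here are illustrative only.
_rng = np.random.default_rng(0)
_vals = np.array([0.6, 0.8, 0.7, 0.9, 0.65])
_boot_means = [_rng.choice(_vals, size=len(_vals), replace=True).mean() for _ in range(1000)]
_lo, _hi = np.percentile(_boot_means, [2.5, 97.5])  # the 95% interval reported above
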
def plot_experiment_A():
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
    variant_order = ["no_rag", "embedding_only", "regex_only", "hybrid"]
    colors = ["#888", "#4c72b0", "#dd8452", "#55a868"]

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))

    # panel 1: headline metric
    means, los, his = [], [], []
    for v in variant_order:
        sub = df[df.variant == v]
        if "claim_support_rate" in sub.columns:
            m, lo, hi = bootstrap_ci(sub["claim_support_rate"].values)
        else:
            m, lo, hi = 0, 0, 0
        means.append(m); los.append(m - lo); his.append(hi - m)
    axes[0].bar(variant_order, means, yerr=[los, his], color=colors, capsize=5)
    axes[0].set_title("Claim support rate (headline)")
    axes[0].set_ylabel("Fraction of claims supported")
    axes[0].set_ylim(0, 1)
    axes[0].tick_params(axis="x", rotation=20)

    # panel 2: retrieval quality
    means, los, his = [], [], []
    for v in variant_order:
        sub = df[df.variant == v]
        if "retrieval_recall_at_3" in sub.columns:
            m, lo, hi = bootstrap_ci(sub["retrieval_recall_at_3"].values)
        else:
            m, lo, hi = 0, 0, 0
        means.append(m); los.append(m - lo); his.append(hi - m)
    axes[1].bar(variant_order, means, yerr=[los, his], color=colors, capsize=5)
    axes[1].set_title("Retrieval recall@3")
    axes[1].set_ylabel("Fraction of gold key_terms covered")
    axes[1].set_ylim(0, 1)
    axes[1].tick_params(axis="x", rotation=20)

    # panel 3: abstention
    abs_by_var = df.groupby("variant")["abstained"].mean().reindex(variant_order)
    axes[2].bar(variant_order, abs_by_var.values, color=colors)
    axes[2].set_title("Abstention rate")
    axes[2].set_ylabel("Fraction of items guard triggered")
    axes[2].set_ylim(0, 1)
    axes[2].tick_params(axis="x", rotation=20)

    plt.tight_layout()
    plt.show()


if experiment_A_df is not None:
    analyze_experiment_A()
    plot_experiment_A()

"""## 15. Release gate

Go/no-go thresholds over the Experiment A aggregates for the variant we would ship.
"""

def release_gate_A(variant="hybrid"):
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_A.csv")
    sub = df[df.variant == variant]

    thresholds = {
        "claim_support_rate": 0.70,  # primary
        "evidence_support": 1.40,
        "citation_faithfulness": 1.40,
        "retrieval_recall_at_3": 0.60,
        "abstained": 0.30,
    }
    lower_is_better = {"abstained"}

    agg = {k: float(np.nanmean(sub[k].values)) for k in thresholds if k in sub.columns}

    print(f"Release gate for variant: {variant}")
    print("=" * 60)
    all_pass = True
    for k, t in thresholds.items():
        if k not in agg:
            continue
        v = agg[k]
        ok = (v <= t) if k in lower_is_better else (v >= t)
        direction = "≤" if k in lower_is_better else "≥"
        status = "PASS" if ok else "FAIL"
        print(f" {k:28s} {v:.3f} (need {direction} {t}) {status}")
        all_pass = all_pass and ok
    print(f"\nFINAL: {'PASS' if all_pass else 'FAIL'}")
    return all_pass


if experiment_A_df is not None:
    release_gate_A(variant="hybrid")
    print()
    release_gate_A(variant="regex_only")

+
"""## 16. Experiment B — top-k sweep
|
| 1275 |
+
|
| 1276 |
+
**Question:** How does the number of retrieved sources (top-k) affect answer correctness?
|
| 1277 |
+
|
| 1278 |
+
**Hypothesis:** Performance peaks somewhere in the middle. k=1 misses context; large k dilutes the prompt with irrelevant chunks.
|
| 1279 |
+
|
| 1280 |
+
**Variable changed:** `top_k ∈ {1, 3, 5, 7}`, applied to both retrieval tiers. Hybrid retrieval; everything else held constant.
|
| 1281 |
+
"""
|
| 1282 |
+
|
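# Note: top_k applies per tier, so k=7 can put up to 14 chunks into the prompt
# (k concept definitions + k paper chunks). Assuming very roughly ~120 tokens per
# chunk, that is ~240 context tokens at k=1 vs ~1680 at k=7 — the dilution the
# hypothesis is about.
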
def run_experiment_B(top_k_values=(1, 3, 5, 7)):
    items = list(validation_set["items"])
    print(f"running experiment B: {len(items)} items × {len(top_k_values)} top-k values "
          f"= {len(items) * len(top_k_values)} runs")

    for k in top_k_values:
        time.sleep(1.5)
        print(f"\n--- top_k = {k} ---")
        for item in items:
            try:
                c, p = retrieve_for_variant(
                    item["passage"], "hybrid", n_concepts=k, n_papers=k,
                )
                retrieved = c + p
                context_text = _format_context_block(c, p)
                result = generate_with_citation_guard(
                    item["passage"], c, p, do_semantic_check=False
                )
                scores = score_all_metrics(
                    item["passage"], item["reference_explanation"],
                    context_text, result["answer"]
                )
                # guard against an empty list (np.mean([]) is nan and warns)
                dists = [r["distance"] for r in retrieved if r["distance"] > 0]
                avg_dist = float(np.mean(dists)) if dists else None

                log_eval_row(
                    "experiment_B", item["id"], f"topk_{k}",
                    retrieved, result, scores,
                    extra={
                        "category": item["category"],
                        "top_k": k,
                        "n_retrieved_total": len(retrieved),
                        "avg_distance": avg_dist,
                    },
                )
            except Exception as e:
                print(f" ERROR {item['id']}: {e}")

    return pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")

def analyze_experiment_B():
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")
    print("mean correctness by top-k (with 95% CI):")
    for k in sorted(df["top_k"].unique()):
        sub = df[df.top_k == k]
        print(f" top_k={k} "
              f"correctness={format_ci(sub['correctness'].values)} "
              f"evidence={format_ci(sub['evidence_support'].values)} "
              f"avg_n_retrieved={sub['n_retrieved_total'].mean():.1f}")
    return df


def plot_experiment_B():
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_B.csv")
    ks = sorted(df["top_k"].unique())
    means, los, his = [], [], []
    for k in ks:
        m, lo, hi = bootstrap_ci(df[df.top_k == k]["correctness"].values)
        means.append(m); los.append(m - lo); his.append(hi - m)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.errorbar(ks, means, yerr=[los, his], marker="o", linewidth=2, capsize=5)
    ax.set_xlabel("top-k (per retrieval tier)")
    ax.set_ylabel("Mean correctness (0-2)")
    ax.set_title("Experiment B — Correctness vs top-k (95% CI)")
    ax.set_xticks(ks)
    ax.set_ylim(0, 2)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


experiment_B_df = load_or_run_experiment("experiment_B", run_experiment_B)
if experiment_B_df is not None:
    analyze_experiment_B()
    plot_experiment_B()

"""## 17. Experiment C — confidence threshold tuning

**Question.** Where should the low-confidence threshold sit so that warnings correlate with wrong answers?

**Hypothesis.** The default threshold of 1.3 was a guess; the F1-maximizing threshold is probably lower.

**Variable changed.** Threshold ∈ [0.6, 1.6] in steps of 0.1.
"""

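# Worked example of the sweep's confusion-matrix terms (hypothetical counts): at some
# threshold the guard warns on 6 items, 4 of which are truly wrong, and it misses 2
# wrong items:
_tp, _fp, _fn = 4, 2, 2
_prec = _tp / (_tp + _fp)                 # 0.667: when we warn, how often rightly?
_rec = _tp / (_tp + _fn)                  # 0.667: what share of wrong answers caught?
_f1 = 2 * _prec * _rec / (_prec + _rec)   # 0.667
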
def run_experiment_C():
    items = list(validation_set["items"])
    for item in items:
        try:
            time.sleep(1.5)
            c, p = retrieve_for_variant(item["passage"], "hybrid")
            all_dists = [r["distance"] for r in (c + p) if r["distance"] > 0]
            best_dist = float(min(all_dists)) if all_dists else 999.0
            context_text = _format_context_block(c, p)
            result = generate_with_citation_guard(
                item["passage"], c, p, do_semantic_check=False
            )
            scores = score_all_metrics(
                item["passage"], item["reference_explanation"], context_text, result["answer"]
            )
            corr = scores["correctness"]["score"]
            log_eval_row(
                "experiment_C", item["id"], "default_system",
                c + p, result, scores,
                extra={
                    "category": item["category"],
                    "best_distance": best_dist,
                    "correctness_raw": corr,
                },
            )
        except Exception as e:
            print(f" ERROR {item['id']}: {e}")
    return pd.read_csv(EVAL_LOG_DIR / "experiment_C.csv")

def analyze_experiment_C():
    df = pd.read_csv(EVAL_LOG_DIR / "experiment_C.csv")
    if "correctness_raw" not in df.columns:
        df["correctness_raw"] = df["correctness"]
    df = df.dropna(subset=["best_distance", "correctness_raw"])

    # define the positive class ("wrong answer"), falling back if correctness saturates
    strict_pos = (df["correctness_raw"] < 1.0).sum()
    if strict_pos > 0:
        df["is_wrong"] = (df["correctness_raw"] < 1.0).astype(int)
        wrongness_def = "correctness < 1.0 (strict)"
    else:
        df["is_wrong"] = (df["correctness_raw"] < 2.0).astype(int)
        wrongness_def = "correctness < 2.0 (saturation fallback)"
    print(f"using wrongness definition: {wrongness_def}")
    print(f" positive class size: {df['is_wrong'].sum()}/{len(df)}")
    if df["is_wrong"].sum() == 0:
        print("⚠ WARNING: no wrong answers in eval set. Tuning is meaningless on this data.")
        return None, None, None

    thresholds = [round(0.6 + 0.1 * i, 2) for i in range(11)]
    rows = []
    for t in thresholds:
        warns = df["best_distance"] > t
        wrong = df["is_wrong"] == 1
        tp = int((warns & wrong).sum())
        fp = int((warns & ~wrong).sum())
        fn = int((~warns & wrong).sum())
        tn = int((~warns & ~wrong).sum())
        precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        tpr = recall
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
        rows.append({"threshold": t, "tp": tp, "fp": fp, "fn": fn, "tn": tn,
                     "refusal_precision": round(precision, 3),
                     "refusal_recall": round(recall, 3),
                     "f1": round(f1, 3),
                     "tpr": round(tpr, 3), "fpr": round(fpr, 3)})
    sweep = pd.DataFrame(rows)
    print(sweep)

    best = sweep.loc[sweep["f1"].idxmax()]
    print(f"\nF1-maximizing threshold: {best['threshold']} (F1={best['f1']})")
    print(f" refusal precision: {best['refusal_precision']}")
    print(f" refusal recall: {best['refusal_recall']}")

    # trapezoidal approximation of the ROC AUC over the swept thresholds
    s = sweep.sort_values("fpr")
    auc = 0.0
    for i in range(1, len(s)):
        auc += (s.iloc[i]["fpr"] - s.iloc[i-1]["fpr"]) * (s.iloc[i]["tpr"] + s.iloc[i-1]["tpr"]) / 2
    print(f"approx ROC AUC: {auc:.3f}")
    return sweep, best, auc

+
def plot_experiment_C():
|
| 1456 |
+
out = analyze_experiment_C()
|
| 1457 |
+
if out is None or out[0] is None:
|
| 1458 |
+
return
|
| 1459 |
+
sweep, best, auc = out
|
| 1460 |
+
fig, ax = plt.subplots(figsize=(7, 7))
|
| 1461 |
+
s = sweep.sort_values("fpr")
|
| 1462 |
+
ax.plot(s["fpr"], s["tpr"], marker="o", linewidth=2, label=f"ROC (AUC≈{auc:.3f})")
|
| 1463 |
+
ax.plot([0, 1], [0, 1], "--", color="gray", alpha=0.5, label="chance")
|
| 1464 |
+
ax.scatter([best["fpr"]], [best["tpr"]], s=200, color="red", zorder=5,
|
| 1465 |
+
label=f"best F1 @ threshold={best['threshold']}")
|
| 1466 |
+
ax.set_xlabel("False positive rate")
|
| 1467 |
+
ax.set_ylabel("True positive rate (refusal recall)")
|
| 1468 |
+
ax.set_title("Experiment C — Abstention threshold ROC")
|
| 1469 |
+
ax.set_xlim(-0.05, 1.05); ax.set_ylim(-0.05, 1.05)
|
| 1470 |
+
ax.legend(loc="lower right")
|
| 1471 |
+
ax.grid(alpha=0.3)
|
| 1472 |
+
plt.tight_layout()
|
| 1473 |
+
plt.show()
|
| 1474 |
+
|
| 1475 |
+
|
| 1476 |
+
experiment_C_df = load_or_run_experiment("experiment_C", run_experiment_C)
|
| 1477 |
+
if experiment_C_df is not None:
|
| 1478 |
+
plot_experiment_C()
|
| 1479 |
+
|
DEFAULT_CONFIDENCE_THRESHOLD = 1.3

try:
    if experiment_C_df is not None:
        out = analyze_experiment_C()
        if out and out[1] is not None:
            _, best, _ = out
            TUNED_CONFIDENCE_THRESHOLD = float(best["threshold"])
        else:
            TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD
    else:
        TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD
except Exception as e:
    print(f"falling back to default threshold ({e})")
    TUNED_CONFIDENCE_THRESHOLD = DEFAULT_CONFIDENCE_THRESHOLD

print(f"TUNED_CONFIDENCE_THRESHOLD = {TUNED_CONFIDENCE_THRESHOLD}")

"""## 18. Main pipeline (with citations + tuned threshold + semantic check)"""

def check_input_quality(text):
    if len(text.strip()) < 20:
        return False, "That's pretty short — try pasting a full sentence or paragraph from a paper."
    if len(text.strip()) > 3000:
        return False, "That's a lot of text. Try pasting just 1-2 paragraphs at a time."
    if len(text.split()) < 5:
        return False, "Try a longer passage — at least a full sentence from a paper."
    return True, "ok"

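# Illustrative calls — the guard runs before any retrieval or generation:
#   check_input_quality("too short")    -> (False, "That's pretty short — ...")
#   check_input_quality("word " * 700)  -> (False, "That's a lot of text. ...")
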
def assess_retrieval_confidence(concept_results, paper_results, threshold=None):
    threshold = threshold if threshold is not None else TUNED_CONFIDENCE_THRESHOLD
    dists = [r["distance"] for r in (concept_results + paper_results) if r["distance"] > 0]
    if not dists:
        return "low", "I couldn't find any relevant context in my knowledge base."
    best = min(dists)
    if best < 0.8:
        return "high", ""
    elif best < threshold:
        return "medium", ("Note: my knowledge base has some related material, but the match isn't perfect. "
                          "Double-check against the paper's own definitions.")
    else:
        return "low", "Heads up: the concepts in this passage don't match well with my current knowledge base."

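# Sanity check with synthetic retrieval results (hypothetical distances; the function
# only reads the "distance" key). Expected bands: 0.5 -> "high"; 1.0 -> "medium"
# whenever the tuned threshold sits above 1.0; 2.0 -> "low".
for _d in (0.5, 1.0, 2.0):
    _level, _note = assess_retrieval_confidence([{"distance": _d}], [])
    print(f" distance={_d} -> {_level}")
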
SCOPE_DISCLAIMER = (
    "---\n"
    "*This tool helps you understand papers; it doesn't replace them. "
    "Every factual sentence above is cited to a specific retrieved source. "
    "⚠️ marks indicate the semantic guard flagged that sentence as not fully supported by its citation. "
    "Always check the original paper.*"
)

def scimplify(passage, variant="hybrid"):
    is_ok, msg = check_input_quality(passage)
    if not is_ok:
        return msg

    c, p = retrieve_for_variant(passage, variant)
    confidence, warning = assess_retrieval_confidence(c, p)
    result = generate_with_citation_guard(passage, c, p)

    parts = []
    if result["guard_triggered"]:
        which = "semantic" if any("semantic" in i for i in result.get("issues", [])) else "lexical"
        parts.append(f"⚠️ The {which} citation guard triggered. Returning abstention rather than a potentially ungrounded answer.")
        if result.get("issues"):
            parts.append(f"\n*Reason: {'; '.join(result['issues'])}*")
        parts.append(f"\n{result['answer']}")
        parts.append(f"\n{SCOPE_DISCLAIMER}")
        return "\n".join(parts)

    if result["answer"].strip() == ABSTAIN_MESSAGE or result["abstained"]:
        parts = [result["answer"], SCOPE_DISCLAIMER]
        return "\n".join(parts)

    if confidence == "low":
        parts.append(f"⚠️ {warning}\n")
    elif confidence == "medium":
        parts.append(f"ℹ️ {warning}\n")

    parts.append(result["answer"])

    # show retrieved sources
    concept_names = [r["concept_name"] for r in c if "concept_name" in r]
    if concept_names:
        parts.append(f"\n\n**Retrieved concepts:** {', '.join(concept_names)}")
    if p:
        sources = sorted(set(r["source_name"] for r in p))
        parts.append(f"**Paper sources:** {', '.join(sources)}")

    # surface semantic check stats if any sentences were checked
    findings = result.get("semantic_findings", [])
    if findings:
        n_total = len(findings)
        n_unsupported = sum(1 for f in findings if f["label"] in ("contradicted", "insufficient"))
        if n_unsupported > 0:
            parts.append(f"\n*Semantic guard: {n_unsupported}/{n_total} cited sentences flagged as not fully supported.*")
        else:
            parts.append(f"\n*Semantic guard: all {n_total} cited sentences supported by their citations ✓*")

    parts.append(f"\n{SCOPE_DISCLAIMER}")
    return "\n".join(parts)

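# Example invocation (kept commented out because DEMO_CLEAN is defined later, in the
# UI section below):
# print(scimplify(DEMO_CLEAN))
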
"""## 19. Gradio UI

Four tabs: explain a passage, upload PDFs, pull papers from arXiv, and inspect the knowledge base.
"""

def add_pdf_to_kb(pdf_file, source_name, source_type):
    if pdf_file is None:
        return "Please upload a PDF file."
    if not source_name.strip():
        return "Please provide a name for this source."
    try:
        text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(text)
        base = source_name.strip().replace(" ", "_")
        ids = [f"user_{base}::c{i}" for i in range(len(chunks))]
        metas = [{
            "source_name": source_name.strip(),
            "source_type": source_type,
            "chunk_id": ids[i],
        } for i in range(len(chunks))]
        if chunks:
            papers_collection.add(documents=chunks, ids=ids, metadatas=metas)
        return f"Added {len(chunks)} chunks. Total: {papers_collection.count()}"
    except Exception as e:
        return f"Error: {e}"

+
def pull_from_arxiv_ui(query, max_results):
|
| 1612 |
+
"""Gradio handler for arXiv ingestion."""
|
| 1613 |
+
try:
|
| 1614 |
+
max_results = int(max_results)
|
| 1615 |
+
if max_results < 1 or max_results > 25:
|
| 1616 |
+
return "Please pick a max_results between 1 and 25."
|
| 1617 |
+
summary = ingest_from_arxiv(query=query, max_results=max_results, verbose=False)
|
| 1618 |
+
msg = (
|
| 1619 |
+
f"✅ Ingested {summary['n_papers']} new paper(s), "
|
| 1620 |
+
f"{summary['n_chunks']} chunks. "
|
| 1621 |
+
f"Skipped {summary['n_skipped']} duplicates. "
|
| 1622 |
+
f"Total in KB: {summary['total_in_kb']} chunks."
|
| 1623 |
+
)
|
| 1624 |
+
if summary["errors"]:
|
| 1625 |
+
msg += f"\n\n⚠️ Errors: {'; '.join(summary['errors'][:3])}"
|
| 1626 |
+
return msg
|
| 1627 |
+
except Exception as e:
|
| 1628 |
+
return f"Error: {e}"
|
| 1629 |
+
|
| 1630 |
+
|
def get_kb_status():
    n_concepts = concepts_collection.count()
    n_papers = papers_collection.count()
    status = f"**Concept definitions:** {n_concepts}\n\n**Paper chunks:** {n_papers}\n"
    if n_papers > 0:
        metas = papers_collection.get()["metadatas"]
        sources = Counter(m["source_name"] for m in metas)
        status += "\n**Ingested sources:**\n"
        for name, count in sources.most_common():
            status += f"- {name} — {count} chunks\n"
    return status

DEMO_CLEAN = "The Diels-Alder reaction is a [4+2] cycloaddition between a conjugated diene and a dienophile, producing a six-membered ring with up to four new stereocenters. The reaction proceeds through a concerted, suprafacial transition state and is highly stereospecific: cis-dienophiles yield cis-substituted cyclohexenes. Electron-withdrawing groups on the dienophile dramatically accelerate the reaction."
DEMO_PAPER = "Multi-region neural population dynamics in the brain have been studied using techniques like LFADS to model the latent factors driving observed activity across regions."
DEMO_ABSTAIN = "Laminated pastry dough is created by repeatedly folding butter into flour-water dough, producing alternating layers that puff up during baking as steam expands between them. Croissants are the canonical example."

with gr.Blocks(title="Scimplify") as app:
    gr.Markdown("# Scimplify — NeuroAI Paper Simplifier")
    gr.Markdown(
        "Paste a NeuroAI paragraph; get a plain-language explanation with citations. "
        "Every factual sentence is grounded in a retrieved source. The lexical guard rejects "
        "invented citation IDs, and the semantic guard verifies that each cited chunk actually "
        "supports the claim. If either check fails, the system abstains rather than hallucinate."
    )

    with gr.Tab("Explain Passage"):
        with gr.Row():
            with gr.Column(scale=1):
                inp = gr.Textbox(label="Passage", lines=8,
                                 placeholder="Paste a paragraph from a paper...")
                btn = gr.Button("Explain", variant="primary")
                gr.Examples(
                    examples=[
                        [DEMO_CLEAN],
                        [DEMO_PAPER],
                        [DEMO_ABSTAIN],
                    ],
                    inputs=[inp],
                    label="Demo passages (clean / paper-chunk / out-of-scope)",
                )
            with gr.Column(scale=2):
                out = gr.Markdown(label="Explanation")
        btn.click(fn=lambda x: scimplify(x), inputs=[inp], outputs=[out])

    with gr.Tab("Add Papers (PDF)"):
        pdf_in = gr.File(label="PDF", file_types=[".pdf"])
        name_in = gr.Textbox(label="Source name")
        type_in = gr.Radio(["paper", "article", "review"], label="Type", value="paper")
        add_btn = gr.Button("Add to knowledge base")
        add_out = gr.Textbox(label="Status")
        add_btn.click(fn=add_pdf_to_kb, inputs=[pdf_in, name_in, type_in], outputs=[add_out])

    with gr.Tab("Pull from arXiv"):
        gr.Markdown(
            "Fetch recent NeuroAI papers from arXiv directly. "
            "Skips papers already in the knowledge base (matched by arxiv_id)."
        )
        arxiv_query = gr.Textbox(
            label="arXiv query",
            value="NeuroAI",
            placeholder="e.g. NeuroAI, brain-inspired deep learning, neural population dynamics",
        )
        arxiv_n = gr.Slider(label="Max papers", minimum=1, maximum=20, value=5, step=1)
        arxiv_btn = gr.Button("Pull from arXiv", variant="primary")
        arxiv_out = gr.Markdown()
        arxiv_btn.click(fn=pull_from_arxiv_ui, inputs=[arxiv_query, arxiv_n], outputs=[arxiv_out])

    with gr.Tab("Knowledge Base"):
        status_out = gr.Markdown(value=get_kb_status())
        refresh_btn = gr.Button("Refresh")
        refresh_btn.click(fn=get_kb_status, outputs=[status_out])

app.launch(share=True)
requirements.txt
ADDED
absl-py==1.3.0
agate==1.6.3
agate-dbf==0.2.2
agate-excel==0.2.5
agate-sql==0.5.8
aiofiles==23.2.1
aiogram==2.21
aiohttp==3.8.1
aiosignal==1.2.0
annexremote==1.6.6
annotated-doc==0.0.4
annotated-types==0.7.0
ants==0.0.7
anyio==4.5.2
appdirs==1.4.4
appnope==0.1.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
arxiv==2.3.2
asgiref==3.8.1
astroid==2.4.2
asttokens==2.0.5
astunparse==1.6.3
async-lru==2.0.4
async-timeout==4.0.2
attrs==25.3.0
autopep8==1.5.4
babel==2.17.0
backcall==0.2.0
backoff==2.2.1
backports.tarfile==1.2.0
backports.zoneinfo==0.2.1
based-on-topic @ file:///Users/Marta/Desktop/miniproject/dist/based_on_topic-0.0.1-py3-none-any.whl
bcrypt==5.0.0
beautifulsoup4==4.13.5
black==22.1.0
bleach==4.1.0
boto3==1.37.38
botocore==1.37.38
branca==0.4.2
build==1.2.2.post1
cachetools==5.2.0
certifi==2022.6.15
cffi==1.15.0
chardet==4.0.0
charset-normalizer==2.1.0
chroma-hnswlib==0.7.6
chromadb==0.5.23
click==8.0.4
click-plugins==1.1.1
cligj==0.7.2
coloredlogs==15.0.1
comm==0.2.3
contourpy==1.0.6
coverage==5.5
csvkit==1.0.7
cycler==0.10.0
datalad==1.1.3
datasets==2.7.1
dbfread==2.0.7
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.3.1
dill==0.3.6
distlib==0.3.1
distro==1.9.0
Django==4.2.29
durationpy==0.10
entrypoints==0.4
environs==8.0.0
et-xmlfile==1.1.0
exceptiongroup==1.3.0
executing==0.8.2
Faker==15.3.2
fastapi==0.124.4
fasteners==0.20
fastjsonschema==2.21.2
feedparser==6.0.12
ffmpy==0.5.0
filelock==3.0.12
Fiona==1.8.21
Flask==1.1.2
flatbuffers==22.12.6
folium==0.12.1
fonttools==4.38.0
fqdn==1.5.1
frozenlist==1.3.0
fsspec==2025.3.0
future==0.18.2
fuzzywuzzy==0.18.0
gast==0.4.0
geographiclib==1.50
geopandas==0.12.1
geopy==2.1.0
gevent==24.2.1
gmplot==1.4.1
google-auth==2.15.0
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
googleapis-common-protos==1.73.0
googlemaps==4.7.3
gradio==4.44.1
gradio_client==1.3.0
graphlib_backport==1.1.0
greenlet==3.1.1
grpcio==1.70.0
h11==0.16.0
h5py==3.7.0
haversine==2.7.0
hf-xet==1.1.9
httpcore==1.0.9
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.34.4
humanfriendly==10.0
humanize==4.10.0
idna==2.10
importlib-resources==5.4.0
importlib_metadata==8.5.0
iniconfig==1.1.1
install==1.3.5
ipykernel==6.9.1
ipython==8.0.1
ipython-genutils==0.2.0
ipywidgets==8.1.7
iso8601==2.1.0
isodate==0.6.1
isoduration==20.11.0
isort==5.5.2
itsdangerous==1.1.0
jaraco.classes==3.4.0
jaraco.context==6.0.1
jaraco.functools==4.1.0
jedi==0.18.1
Jinja2==3.1.6
jiter==0.9.1
jmespath==1.0.1
joblib==1.1.0
json5==0.12.1
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter==1.0.0
jupyter-console==6.4.0
jupyter-events==0.10.0
jupyter-lsp==2.3.0
jupyter_client==7.4.9
jupyter_core==5.8.1
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.3.8
jupyterlab-pygments==0.1.2
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.15
keyring==25.5.0
keyrings.alt==5.0.2
kiwisolver==1.3.2
kornia==0.7.0
kubernetes==35.0.0
lazy-object-proxy==1.4.3
leather==0.3.4
libclang==14.0.6
llvmlite==0.41.1
logging==0.4.9.6
looseversion==1.3.0
lxml==6.0.2
Markdown==3.4.1
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.17.0
matplotlib==3.6.2
matplotlib-inline==0.1.3
mccabe==0.6.1
mdurl==0.1.2
mistune==3.1.4
mmh3==5.0.1
more-itertools==10.5.0
mplcursors==0.5.2
mpmath==1.2.1
msgpack==1.1.1
multidict==6.0.2
multiprocess==0.70.14
munch==2.5.0
mypy-extensions==0.4.3
mysql-connector-python==8.0.33
nbclient==0.5.11
nbconvert==7.16.6
nbformat==5.10.4
nest-asyncio==1.5.4
networkx==2.8
nibabel==5.2.1
nilearn==0.10.4
nltk==3.6.5
notebook==7.3.3
notebook_shim==0.2.4
numba==0.58.1
numpy==1.23.5
oauthlib==3.2.2
olefile==0.46
onnxruntime==1.19.2
openai==1.109.1
openai-whisper==20250625
opencv-contrib-python==4.6.0.66
openpyxl==3.0.10
opentelemetry-api==1.33.1
opentelemetry-exporter-otlp-proto-common==1.33.1
opentelemetry-exporter-otlp-proto-grpc==1.33.1
opentelemetry-instrumentation==0.54b1
opentelemetry-instrumentation-asgi==0.54b1
opentelemetry-instrumentation-fastapi==0.54b1
opentelemetry-proto==1.33.1
opentelemetry-sdk==1.33.1
opentelemetry-semantic-conventions==0.54b1
opentelemetry-util-http==0.54b1
opt-einsum==3.3.0
orjson==3.10.15
osmnx==1.2.2
overrides==7.7.0
packaging==25.0
pandas==1.5.2
pandas-to-sql==0.0.546
pandasql==0.7.3
pandocfilters==1.5.0
parsedatetime==2.4
parso==0.8.3
pathspec==0.9.0
patool==1.12
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.0.1
pkgutil_resolve_name==1.3.10
platformdirs==2.5.1
plotly==5.11.0
pluggy==0.13.1
posthog==4.2.0
prettytable==2.0.0
prometheus-client==0.13.1
prompt-toolkit==3.0.28
protobuf==5.29.6
ptyprocess==0.7.0
PuLP==2.7.0
pure-eval==0.2.2
py==1.10.0
pyarrow==10.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycodestyle==2.6.0
pycparser==2.21
pydantic==2.10.6
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.2
pylint==2.6.0
pyparsing==2.4.7
PyPDF2==3.0.1
PyPika==0.51.1
pyproj==3.3.1
pyproject_hooks==1.2.0
PyQt5==5.15.7
PyQt5-Qt5==5.15.2
PyQt5-sip==12.11.0
pyrsistent==0.18.1
pytest==6.2.2
python-dateutil==2.9.0.post0
python-dotenv==0.20.0
python-gitlab==4.13.0
python-json-logger==3.3.0
python-multipart==0.0.20
python-slugify==6.1.2
python-speech-features==0.6
pytimeparse==1.1.8
pytz==2020.4
PyWavelets==1.4.1
PyYAML==6.0
pyzmq==27.1.0
qtconsole==5.2.2
QtPy==2.0.1
rawkit==0.6.0
referencing==0.35.1
regex==2024.11.6
requests==2.32.4
requests-oauthlib==1.3.1
requests-toolbelt==1.0.0
responses==0.18.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==14.3.4
rpds-py==0.20.1
rsa==4.9
Rtree==1.0.1
ruff==0.15.12
s3transfer==0.11.5
safetensors==0.5.3
scikit-learn==1.1.3
scipy==1.9.3
seaborn==0.12.1
semantic-version==2.10.0
Send2Trash==1.8.3
sentence-transformers==3.2.1
sgmllib3k==1.0.0
Shapely==1.8.2
shellingham==1.5.4
six==1.15.0
sklearn==0.0.post1
sniffio==1.3.1
soupsieve==2.7
SQLAlchemy==1.4.37
sqlparse==0.5.5
stack-data==0.2.0
starlette==0.44.0
sympy==1.11.1
tabulate==0.8.7
tenacity==9.0.0
termcolor==1.1.0
terminado==0.13.1
testpath==0.5.0
text-unidecode==1.3
threadpoolctl==3.1.0
tiktoken==0.7.0
tokenizers==0.20.3
toml==0.10.1
tomli==2.0.1
tomlkit==0.12.0
torch==1.13.0
torchvision==0.14.0
tornado==6.4.2
tqdm==4.67.3
traitlets==5.14.3
transformers==4.46.3
typer==0.20.1
types-python-dateutil==2.9.0.20241206
typing_extensions==4.13.2
uri-template==1.3.0
urllib3==2.2.3
uvicorn==0.33.0
uvloop==0.22.1
virtualenv==20.4.2
watchfiles==0.24.0
wcwidth==0.2.5
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
websockets==12.0
Werkzeug==1.0.1
widgetsnbextension==4.0.14
wooldridge==0.4.4
wordcloud==1.8.2.2
wrapt==1.12.1
xgboost==1.7.2
xlrd==2.0.1
xxhash==3.1.0
yarl==1.8.1
zipp==3.20.2
zope.event==5.0
zope.interface==7.2