Spaces:
Sleeping
Sleeping
Don Rishabh committed on
Commit ·
86be5e0
1
Parent(s): 433bfad
demo: filter tasks dead on target (verbose=0 AND trained=0)
Browse files- space-demo/app.py +18 -4
space-demo/app.py
CHANGED
|
@@ -76,15 +76,29 @@ def load_demo_rows() -> List[Dict]:
|
|
| 76 |
with urllib.request.urlopen(req) as r:
|
| 77 |
text = r.read().decode("utf-8")
|
| 78 |
rows = list(csv.DictReader(io.StringIO(text)))
|
|
|
|
| 79 |
|
| 80 |
-
def
|
| 81 |
try:
|
| 82 |
-
return float(r.get(
|
| 83 |
except ValueError:
|
| 84 |
return 0.0
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
return rows
|
| 89 |
|
| 90 |
|
|
|
|
| 76 |
with urllib.request.urlopen(req) as r:
|
| 77 |
text = r.read().decode("utf-8")
|
| 78 |
rows = list(csv.DictReader(io.StringIO(text)))
|
| 79 |
+
n_total = len(rows)
|
| 80 |
|
| 81 |
+
def _f(r: Dict, k: str) -> float:
|
| 82 |
try:
|
| 83 |
+
return float(r.get(k) or 0)
|
| 84 |
except ValueError:
|
| 85 |
return 0.0
|
| 86 |
|
| 87 |
+
# Filter out tasks that are dead on this target — both the human
|
| 88 |
+
# verbose prompt AND the trained agent's prompt score 0. Those are
|
| 89 |
+
# tasks where the target genuinely can't do the task regardless of
|
| 90 |
+
# prompt, and they just clutter the demo UI dropdown.
|
| 91 |
+
def _alive(r: Dict) -> bool:
|
| 92 |
+
return (_f(r, "verbose_accuracy") > 0
|
| 93 |
+
or _f(r, "trained_accuracy") > 0
|
| 94 |
+
or _f(r, "trained_reward") > 0)
|
| 95 |
+
|
| 96 |
+
rows = [r for r in rows if _alive(r)]
|
| 97 |
+
|
| 98 |
+
# Sort by trained reward (desc) — most interesting tasks first
|
| 99 |
+
rows.sort(key=lambda r: _f(r, "trained_reward"), reverse=True)
|
| 100 |
+
print(f"[demo] loaded {len(rows)}/{n_total} rows "
|
| 101 |
+
f"(filtered out tasks dead on this target)", flush=True)
|
| 102 |
return rows
|
| 103 |
|
| 104 |
|