File size: 4,543 Bytes
98a5a8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env bash
# eval_all.sh — Budget Router Evaluator Wrapper
# ==============================================
# Runs heuristic + LLM eval and saves results to outputs/.
#
# Usage:
#   chmod +x eval_all.sh
#   ./eval_all.sh                      # quick: 3 seeds, heuristic + LLM
#   ./eval_all.sh --seeds 10           # full dev set
#   ./eval_all.sh --policies heuristic # no LLM (no API needed)
#   ./eval_all.sh --tasks hard hard_multi --seeds 5
#
# Prerequisites:
#   export HF_TOKEN=<your_huggingface_token>
#   export API_BASE_URL=https://router.huggingface.co/v1  (default)
#   export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct           (default)
#   uv or pip install -e . (to install budget_router package)
#
# Outputs (in outputs/ directory):
#   eval_results_<timestamp>.json    — full per-episode grader breakdown
#   eval_summary_<timestamp>.md      — markdown table ready for README

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory holding this script, and of its parent.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# ── Defaults ────────────────────────────────────────────────────────────────
# Starting values; the command-line parser further down may replace them.
EXTRA_ARGS=()                        # arguments this wrapper does not recognise
OUT_DIR="$REPO_ROOT/outputs"         # where results are written
SEED_SET="dev"
SEEDS=3
TASKS="easy medium hard hard_multi"  # space-separated list
POLICIES="heuristic llm"             # space-separated list

# ── Parse CLI args ──────────────────────────────────────────────────────────
# parse_cli_args ARG...
#   Updates the SEEDS, SEED_SET, OUT_DIR, POLICIES and TASKS globals from the
#   given arguments. Anything not recognised is appended to EXTRA_ARGS in the
#   order it was seen.
#
#   --seeds N / --seed-set NAME / --out-dir DIR   take exactly one value.
#   --policies A B ... / --tasks A B ...          take every following word up
#                                                 to the next word starting
#                                                 with "--"; the words are
#                                                 stored space-separated.
#
#   Exits with status 1 and a message on stderr when a single-value option is
#   the last argument and therefore has no value.
parse_cli_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --seeds | --seed-set | --out-dir)
                # Without this check, reading "$2" under `set -u` aborts with
                # a bare "unbound variable" message that does not name the
                # offending option.
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value." >&2
                    exit 1
                fi
                case "$1" in
                    --seeds)    SEEDS="$2" ;;
                    --seed-set) SEED_SET="$2" ;;
                    --out-dir)  OUT_DIR="$2" ;;
                esac
                shift 2
                ;;
            --policies)
                POLICIES=""
                shift
                while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
                    # ${VAR:+...} adds the separator only between items, so
                    # the list never starts with a stray space.
                    POLICIES="${POLICIES:+$POLICIES }$1"
                    shift
                done
                ;;
            --tasks)
                TASKS=""
                shift
                while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
                    TASKS="${TASKS:+$TASKS }$1"
                    shift
                done
                ;;
            *)
                EXTRA_ARGS+=("$1")
                shift
                ;;
        esac
    done
}

parse_cli_args "$@"

# ── Validate environment ─────────────────────────────────────────────────────
# print_config_banner
#   Writes the banner and the effective configuration (POLICIES, TASKS, SEEDS,
#   SEED_SET, OUT_DIR) to stdout. When POLICIES contains the substring "llm"
#   it also reports the API credentials: a warning if neither HF_TOKEN nor
#   API_KEY is set, otherwise the key length, model and endpoint.
#
#   The key itself is never printed, not even a prefix: this output commonly
#   ends up in terminal scrollback and CI logs.
print_config_banner() {
    echo ""
    echo "╔══════════════════════════════════════════════╗"
    echo "║   Budget Router Evaluator                   ║"
    echo "╚══════════════════════════════════════════════╝"
    echo ""
    echo "Config:"
    echo "  Policies:  $POLICIES"
    echo "  Tasks:     $TASKS"
    echo "  Seeds:     $SEEDS (seed_set=$SEED_SET)"
    echo "  Output:    $OUT_DIR/"
    echo ""

    # Credentials only matter when an LLM policy was requested
    # (substring match anywhere in the list).
    if [[ "$POLICIES" != *llm* ]]; then
        return 0
    fi

    # HF_TOKEN takes precedence; API_KEY is the fallback.
    local token="${HF_TOKEN:-${API_KEY:-}}"
    if [[ -z "$token" ]]; then
        echo "⚠️  WARNING: HF_TOKEN and API_KEY not set."
        echo "   LLM policy will be skipped. Set HF_TOKEN to enable."
        echo ""
    else
        echo "  API key:   set (${#token} chars, value hidden)"
        echo "  Model:     ${MODEL_NAME:-Qwen/Qwen2.5-72B-Instruct}"
        echo "  Endpoint:  ${API_BASE_URL:-https://router.huggingface.co/v1}"
        echo ""
    fi
}

print_config_banner

# ── Build typer args ─────────────────────────────────────────────────────────
# build_typer_args
#   Fills the global TYPER_ARGS array from SEEDS, SEED_SET, OUT_DIR, POLICIES
#   and TASKS: one "--policies X" pair per word of POLICIES and one
#   "--tasks X" pair per word of TASKS.
build_typer_args() {
    # This script changes into SCRIPT_DIR before launching the evaluator, so a
    # relative --out-dir would otherwise be resolved against the script's
    # directory instead of the directory the user invoked it from. Anchor it
    # to the current directory while that is still the caller's.
    local out_dir="$OUT_DIR"
    if [[ -n "$out_dir" && "$out_dir" != /* ]]; then
        out_dir="$PWD/$out_dir"
    fi

    TYPER_ARGS=(
        "--seeds" "$SEEDS"
        "--seed-set" "$SEED_SET"
        "--out-dir" "$out_dir"
    )

    # POLICIES and TASKS are space-separated lists, so word splitting is
    # intended here; pathname expansion is not. Disable globbing while
    # splitting so a value such as "hard*" is forwarded literally instead of
    # being replaced by matching file names.
    local had_noglob=0
    if [[ $- == *f* ]]; then
        had_noglob=1
    fi
    set -f

    local item
    for item in $POLICIES; do
        TYPER_ARGS+=("--policies" "$item")
    done
    for item in $TASKS; do
        TYPER_ARGS+=("--tasks" "$item")
    done

    # Restore globbing unless the caller already had it disabled.
    if [[ $had_noglob -eq 0 ]]; then
        set +f
    fi
}

build_typer_args

# ── Run ──────────────────────────────────────────────────────────────────────
# eval_all.py is referenced by a relative path, so work from the script's
# own directory.
cd "$SCRIPT_DIR"

# Choose how to start Python: `uv run python` when uv is on PATH, otherwise
# python3. With neither available there is nothing to run.
if command -v uv &>/dev/null; then
    LAUNCHER=(uv run python)
elif command -v python3 &>/dev/null; then
    LAUNCHER=(python3)
else
    echo "Error: neither uv nor python3 found." >&2
    exit 1
fi

# Arguments built by this wrapper come first, then the unrecognised
# command-line arguments.
"${LAUNCHER[@]}" eval_all.py "${TYPER_ARGS[@]}" "${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}"

echo ""
echo "✅ Evaluation complete. Results in $OUT_DIR/"