Spaces:
Sleeping
Sleeping
File size: 6,942 Bytes
"""Ailiance Playground — bench Phase 6 scoreboard.

Interactive viewer of ailiance/ailiance-bench Phase 6 results.
Source of truth: bench-results/compare_base_vs_lora.md (commit 46801af).
"""
from __future__ import annotations
import gradio as gr
import pandas as pd
# Adapter score columns of the scoreboard, in display order.
ADAPTERS = ["+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"]

# Phase 6 scoreboard (mirror of bench-results/compare_base_vs_lora.md).
# One row per benchmark task. "base" holds the score of the base model
# (gemma-e4b-eu-kiki-base); the remaining columns hold one score per adapter.
SCOREBOARD = pd.DataFrame(
    data=[
        ["P1", "kicad-dsl", 0.090, 0.640, 0.090, 0.090, 0.090],
        ["P1", "kicad-pcb", 0.010, 0.430, 0.010, 0.010, 0.015],
        ["P1", "spice-sim", 0.425, 0.676, 0.176, 0.189, 0.268],
        ["P2", "kicad-sch-gen", 0.420, 0.220, 0.400, 0.320, 0.180],
        ["P3", "kicad-sch-extract", 0.308, 0.690, 0.785, 0.350, 0.000],
        ["P4", "kicad-erc-abs", 0.060, 0.057, 0.060, 0.060, 0.033],
        ["P5", "kicad-erc-delta", 0.060, 0.057, 0.060, 0.060, 0.033],
    ],
    # The adapter columns come straight from ADAPTERS so both stay in sync.
    columns=["Phase", "Task", "base", *ADAPTERS],
)
# Markdown summary of the bench conclusions, one top-level bullet per verdict
# with supporting details as nested bullets.
VERDICTS = """
### Verdicts

- 🥇 **eu-kiki** — generalist champion (4/7 tasks)
  - Peak: P1-DSL **+55 pts**, P1-PCB **+42 pts**
  - Hosted on `:8502` (macm1 Gemma-4 + curriculum LoRA)
- 🥈 **mascarade** — P3 extraction champion (**+48 pts**)
  - Wins narrow extraction tasks but loses generation
  - Hosted on Tower Ollama `:8004`
- ⚠️ **aggro** — neutral (sanity-check baseline)
- ❌ **kicad9plus** — catastrophic forgetting on SPICE/P2/P3
  - **Use only** in permissive-KiCad-only contexts
- 🚫 **kicad-sch from-scratch** — unresolved across all 4 adapters
  - Bottleneck: KiCad 6+ S-expr absent from pre-training corpus
"""
# One-line, human-readable description for each task name used in the
# scoreboard's "Task" column.
TASK_DESCRIPTIONS: dict[str, str] = {
    "kicad-dsl": "Generate KiCad design DSL from a natural language spec",
    "kicad-pcb": "Generate KiCad PCB layout description",
    "spice-sim": "Reason about SPICE circuit simulation behavior",
    "kicad-sch-gen": "Generate a full .kicad_sch file from scratch",
    "kicad-sch-extract": "Extract components/nets from existing .kicad_sch",
    "kicad-erc-abs": "Detect absolute ERC (electrical rule) violations",
    "kicad-erc-delta": "Compute ERC delta between schematic revisions",
}
def compute_delta(row: pd.Series, adapter: str) -> str:
    """Render one scoreboard cell as ``"<score> (<signed Δ>)"``.

    Args:
        row: A scoreboard row; it must provide a ``"base"`` entry and an
            entry named after ``adapter``.
        adapter: Name of the adapter column to format.

    Returns:
        The adapter score with three decimals, followed in parentheses by
        its difference to the base score expressed in points (score
        difference times 100), rounded to a whole number and prefixed with
        ``+`` when it is not negative.
    """
    reference = row["base"]
    value = row[adapter]
    points = (value - reference) * 100

    # Negative numbers already carry their own "-" once formatted, so only
    # the non-negative case needs an explicit sign.
    if points >= 0:
        prefix = "+"
    else:
        prefix = ""

    return f"{value:.3f} ({prefix}{points:.0f})"
def styled_scoreboard() -> pd.DataFrame:
    """Return a display copy of ``SCOREBOARD`` with pre-formatted cells.

    Every adapter column is replaced by the text produced by
    ``compute_delta`` (score plus delta in parentheses), and the ``base``
    column is rendered with three decimals. ``SCOREBOARD`` itself is left
    untouched.
    """
    table = SCOREBOARD.copy()

    for column in ADAPTERS:
        # Each row is passed as the first argument and the column name as
        # the second, i.e. compute_delta(row, column).
        table[column] = table.apply(compute_delta, axis=1, args=(column,))

    # Format "base" last: compute_delta needs it as a number while the
    # adapter columns are being built.
    table["base"] = table["base"].map("{:.3f}".format)
    return table
def task_detail(task: str | None) -> tuple[str, pd.DataFrame]:
    """Build the drill-down view for a single benchmark task.

    Args:
        task: Task name as it appears in the ``Task`` column of
            ``SCOREBOARD``. ``None`` or an unknown name selects the
            placeholder output.

    Returns:
        A ``(markdown, table)`` pair. The markdown names the task, its
        description from ``TASK_DESCRIPTIONS`` (empty if there is none) and
        the base score. The table has one row per adapter in ``ADAPTERS``
        with the columns ``Adapter``, ``Score`` and ``Δ vs base``. For a
        missing or unknown task a hint message and an empty frame are
        returned instead.
    """
    if task is None or task not in SCOREBOARD["Task"].values:
        return "Pick a task above to see the per-adapter breakdown.", pd.DataFrame()

    row = SCOREBOARD[SCOREBOARD["Task"] == task].iloc[0]
    base = row["base"]

    rows = []
    for adapter in ADAPTERS:
        score = row[adapter]
        delta = (score - base) * 100  # score difference expressed in points
        # Adapter names carry a leading "+" as column labels; drop it here.
        rows.append([adapter.lstrip("+"), f"{score:.3f}", f"{delta:+.1f} pts"])

    df = pd.DataFrame(rows, columns=["Adapter", "Score", "Δ vs base"])
    description = TASK_DESCRIPTIONS.get(task, "")
    md = f"**{task}** — {description}\n\nBase score: `{base:.3f}` (Gemma-E4B)"
    return md, df
def best_per_task() -> pd.DataFrame:
    """Return the highest-scoring adapter for every task in ``SCOREBOARD``.

    Only the columns listed in ``ADAPTERS`` compete; the base score is used
    solely as the reference for the delta. When several adapters share the
    top score, the one listed first in ``ADAPTERS`` is reported. The delta
    can be negative when even the best adapter scores below the base model.

    Returns:
        A frame with the columns ``Phase``, ``Task``, ``Winner`` (adapter
        name without the leading ``+``), ``Score`` (three decimals) and
        ``Δ`` (signed difference to the base score in points).
    """
    rows = []
    for _, row in SCOREBOARD.iterrows():
        scores = {adapter: row[adapter] for adapter in ADAPTERS}
        # max() keeps the first maximal key, so ties follow ADAPTERS order.
        winner = max(scores, key=scores.get)
        delta = (scores[winner] - row["base"]) * 100
        rows.append(
            [
                row["Phase"],
                row["Task"],
                winner.lstrip("+"),
                f"{scores[winner]:.3f}",
                f"{delta:+.1f} pts",
            ]
        )
    return pd.DataFrame(rows, columns=["Phase", "Task", "Winner", "Score", "Δ"])
# Gradio UI: a header followed by four tabs (scoreboard, per-task drill-down,
# per-task winners, about). The markdown literals are kept flush-left so the
# rendered text carries no accidental leading indentation.
with gr.Blocks(
    title="Ailiance Playground — Bench Phase 6",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
# 🏆 Ailiance Playground

**Phase 6 bench scoreboard** — 7-task hardware-design evaluation of
LoRA adapters against the base Gemma-E4B model.

Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) · commit `46801af`
"""
    )

    with gr.Tab("Scoreboard"):
        gr.Markdown(
            "Each cell shows the adapter score and Δ in points (× 100) vs base."
        )
        gr.Dataframe(
            styled_scoreboard(),
            interactive=False,
            wrap=True,
        )
        gr.Markdown(VERDICTS)

    with gr.Tab("Task drill-down"):
        gr.Markdown("Pick a task to see per-adapter performance and Δ.")
        task_dropdown = gr.Dropdown(
            choices=list(SCOREBOARD["Task"]),
            label="Task",
            value="kicad-dsl",
        )
        task_md = gr.Markdown()
        task_table = gr.Dataframe(interactive=False)

        # Refresh the breakdown whenever another task is selected.
        task_dropdown.change(
            task_detail, inputs=task_dropdown, outputs=[task_md, task_table]
        )
        # Initial render: fill the breakdown for the default task on page
        # load, before any change event has fired.
        demo.load(task_detail, inputs=task_dropdown, outputs=[task_md, task_table])

    with gr.Tab("Winners"):
        gr.Markdown("Best adapter per task and the gain over the base model.")
        gr.Dataframe(best_per_task(), interactive=False)

    with gr.Tab("About"):
        gr.Markdown(
            """
## About ailiance-bench Phase 6

The bench evaluates LoRA adapters fine-tuned on hardware-design tasks
against the base `gemma-e4b-eu-kiki-base` model. Phase 6 is the final
ship of the 2026-05-11 benchmark cycle.

**Adapters compared:**

- `eu-kiki` — generalist hardware adapter (curriculum LoRA on macm1)
- `mascarade` — domain-specialist family (Qwen3-4B base, per-domain LoRAs on Tower)
- `aggro` — adversarial-data baseline (sanity check)
- `kicad9plus` — corpus-only continual pretrain on KiCad 9+ schematics

**Methodology:** see [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench) `bench-results/compare_base_vs_lora.{md,json}`.

**Production impact:** the ailiance gateway (`:9300`) routes `kicad-dsl` /
`kicad-pcb` to `eu-kiki` (PR #54) and 9 hardware domains to the
mascarade Tower Ollama (PR #49), after this bench validated each
adapter's strengths.

## Links

- 🌐 [ailiance.fr](https://ailiance.fr)
- 💻 [github.com/ailiance](https://github.com/ailiance)
- 📦 [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
- 📊 [bench source](https://github.com/ailiance/ailiance-bench)
- 📜 EU AI Act tags: `art-52`, `art-53`, `gpai-fine-tune`
"""
        )


if __name__ == "__main__":
    # Serve on all network interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|