Spaces:
Running
Running
"""Ailiance Playground — bench Phase 6 scoreboard.

Interactive viewer of ailiance/ailiance-bench Phase 6 results.
Source of truth: bench-results/compare_base_vs_lora.md (commit 46801af).
"""
| from __future__ import annotations | |
| import gradio as gr | |
| import pandas as pd | |
# Phase 6 scoreboard (mirror of bench-results/compare_base_vs_lora.md).
# base model: gemma-e4b-eu-kiki-base
#
# One row per benchmark task. "base" holds the un-adapted model's score and
# every "+<name>" column holds the score obtained with that LoRA adapter.
_SCORE_COLUMNS = [
    "Phase",
    "Task",
    "base",
    "+eu-kiki",
    "+mascarade",
    "+aggro",
    "+kicad9plus",
]
_SCORE_ROWS = [
    ("P1", "kicad-dsl", 0.090, 0.640, 0.090, 0.090, 0.090),
    ("P1", "kicad-pcb", 0.010, 0.430, 0.010, 0.010, 0.015),
    ("P1", "spice-sim", 0.425, 0.676, 0.176, 0.189, 0.268),
    ("P2", "kicad-sch-gen", 0.420, 0.220, 0.400, 0.320, 0.180),
    ("P3", "kicad-sch-extract", 0.308, 0.690, 0.785, 0.350, 0.000),
    ("P4", "kicad-erc-abs", 0.060, 0.057, 0.060, 0.060, 0.033),
    ("P5", "kicad-erc-delta", 0.060, 0.057, 0.060, 0.060, 0.033),
]
SCOREBOARD = pd.DataFrame(_SCORE_ROWS, columns=_SCORE_COLUMNS)

# Adapter columns, in display order: everything after Phase / Task / base.
ADAPTERS = _SCORE_COLUMNS[3:]
# Hand-written Markdown summary of the results; it is a literal, not computed
# from SCOREBOARD, and is rendered below the table in the "Scoreboard" tab.
# NOTE(review): several glyphs in this literal look mis-encoded (emoji and
# dashes) — confirm the file's encoding before editing the text.
VERDICTS = """
### Verdicts
- π₯ **eu-kiki** β generalist champion (4/7 tasks)
- Peak: P1-DSL **+55 pts**, P1-PCB **+42 pts**
- Hosted on `:8502` (macm1 Gemma-4 + curriculum LoRA)
- π₯ **mascarade** β P3 extraction champion (**+48 pts**)
- Wins narrow extraction tasks but loses generation
- Hosted on Tower Ollama `:8004`
- β οΈ **aggro** β neutral (sanity-check baseline)
- β **kicad9plus** β catastrophic forgetting on SPICE/P2/P3
- **Use only** in permissive-KiCad-only contexts
- π« **kicad-sch from-scratch** β unresolved across all 4 adapters
- Bottleneck: KiCad 6+ S-expr absent from pre-training corpus
"""
# One-line human-readable description per benchmark task, keyed by the value
# used in SCOREBOARD's "Task" column. task_detail() looks tasks up here and
# falls back to an empty description for a missing key.
TASK_DESCRIPTIONS = {
    "kicad-dsl": "Generate KiCad design DSL from a natural language spec",
    "kicad-pcb": "Generate KiCad PCB layout description",
    "spice-sim": "Reason about SPICE circuit simulation behavior",
    "kicad-sch-gen": "Generate a full .kicad_sch file from scratch",
    "kicad-sch-extract": "Extract components/nets from existing .kicad_sch",
    "kicad-erc-abs": "Detect absolute ERC (electrical rule) violations",
    "kicad-erc-delta": "Compute ERC delta between schematic revisions",
}
def compute_delta(row: pd.Series, adapter: str) -> str:
    """Render one adapter cell as ``"<score> (<signed delta>)"``.

    Args:
        row: Mapping-like record that provides a ``"base"`` entry and an
            entry named by ``adapter``.
        adapter: Key of the adapter score to format.

    Returns:
        The adapter score with three decimals, followed in parentheses by its
        difference to the base score expressed in points (difference x 100),
        rounded to a whole number.
    """
    adapter_score = row[adapter]
    points = (adapter_score - row["base"]) * 100
    # Negative values already carry their own "-" from the number format, so
    # only non-negative deltas need an explicit sign.
    if points >= 0:
        prefix = "+"
    else:
        prefix = ""
    return f"{adapter_score:.3f} ({prefix}{points:.0f})"
def styled_scoreboard() -> pd.DataFrame:
    """Return a display copy of SCOREBOARD with every score rendered as text.

    Adapter columns become ``"<score> (<signed delta>)"`` strings produced by
    ``compute_delta``; the ``base`` column becomes a three-decimal string.
    SCOREBOARD itself is left untouched.
    """
    table = SCOREBOARD.copy()
    for adapter in ADAPTERS:
        # Row-wise: each row is handed to compute_delta(row, adapter).
        table[adapter] = table.apply(compute_delta, axis=1, args=(adapter,))
    # Format "base" last: compute_delta above still needs it as a number.
    table["base"] = table["base"].map("{:.3f}".format)
    return table
def task_detail(task: str) -> tuple[str, pd.DataFrame]:
    """Build the drill-down view for a single task.

    Args:
        task: Value from SCOREBOARD's ``Task`` column. ``None`` or a name that
            is not in that column selects the placeholder output.

    Returns:
        A ``(markdown, table)`` pair. For a known task, ``markdown`` names the
        task, its description and the base score, and ``table`` has one row
        per adapter with the columns ``Adapter``, ``Score`` and ``Δ vs base``
        (the difference to the base score in points, i.e. x 100). Otherwise
        a prompt message and an empty DataFrame are returned.
    """
    if task is None or task not in SCOREBOARD["Task"].values:
        return "Pick a task above to see the per-adapter breakdown.", pd.DataFrame()

    row = SCOREBOARD[SCOREBOARD["Task"] == task].iloc[0]
    base = row["base"]
    rows = [
        [
            adapter.lstrip("+"),  # show the bare adapter name
            f"{row[adapter]:.3f}",
            f"{(row[adapter] - base) * 100:+.1f} pts",
        ]
        for adapter in ADAPTERS
    ]
    # The header and the separator below were mis-encoded ("Ξ" / "β");
    # restored to the intended "Δ" and em dash.
    df = pd.DataFrame(rows, columns=["Adapter", "Score", "Δ vs base"])
    description = TASK_DESCRIPTIONS.get(task, "")
    md = f"**{task}** — {description}\n\nBase score: `{base:.3f}` (Gemma-E4B)"
    return md, df
def best_per_task() -> pd.DataFrame:
    """Report the highest-scoring adapter for every task in SCOREBOARD.

    Returns:
        A DataFrame with the columns ``Phase``, ``Task``, ``Winner`` (adapter
        name without its leading "+"), ``Score`` (three decimals) and ``Δ``
        (winner score minus base score, in points). The base model is not a
        candidate, so ``Δ`` is zero or negative when no adapter beats it.
    """
    rows = []
    for _, row in SCOREBOARD.iterrows():
        scores = {a: row[a] for a in ADAPTERS}
        # max() returns the first maximal key, so ties go to the adapter that
        # appears earliest in ADAPTERS.
        winner = max(scores, key=scores.get)
        delta = (scores[winner] - row["base"]) * 100
        rows.append(
            [
                row["Phase"],
                row["Task"],
                winner.lstrip("+"),
                f"{scores[winner]:.3f}",
                f"{delta:+.1f} pts",
            ]
        )
    # Column header was mis-encoded as "Ξ"; restored to the intended "Δ".
    return pd.DataFrame(rows, columns=["Phase", "Task", "Winner", "Score", "Δ"])
# Gradio UI: a Blocks app with a header and four tabs (Scoreboard, Task
# drill-down, Winners, About). Components are declared in on-page order.
# NOTE(review): several string literals below contain what look like
# mis-encoded characters (e.g. "Ξ" where "Δ" is evidently meant, "Γ" before
# "100", and emoji glyphs) — confirm the file is stored and served as UTF-8.
with gr.Blocks(
    title="Ailiance Playground β Bench Phase 6",
    theme=gr.themes.Soft(),
) as demo:
    # Page header, declared before the first tab.
    gr.Markdown(
        """
# π Ailiance Playground
**Phase 6 bench scoreboard** β 7-task hardware-design evaluation of
LoRA adapters against the base Gemma-E4B model.
Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) Β· commit `46801af`
"""
    )
    # Tab 1: the full table plus the hand-written verdicts.
    with gr.Tab("Scoreboard"):
        gr.Markdown(
            "Each cell shows the adapter score and Ξ in points (Γ 100) vs base."
        )
        # styled_scoreboard() is called once here, while the layout is built.
        gr.Dataframe(
            styled_scoreboard(),
            interactive=False,
            wrap=True,
        )
        gr.Markdown(VERDICTS)
    # Tab 2: per-task breakdown driven by a dropdown.
    with gr.Tab("Task drill-down"):
        gr.Markdown("Pick a task to see per-adapter performance and Ξ.")
        task_dropdown = gr.Dropdown(
            choices=list(SCOREBOARD["Task"]),
            label="Task",
            value="kicad-dsl",
        )
        # Both outputs start empty and are filled by task_detail().
        task_md = gr.Markdown()
        task_table = gr.Dataframe(interactive=False)
        # Re-render whenever the selected task changes.
        task_dropdown.change(
            task_detail, inputs=task_dropdown, outputs=[task_md, task_table]
        )
        # Initial render
        demo.load(task_detail, inputs=task_dropdown, outputs=[task_md, task_table])
    # Tab 3: best adapter per task; best_per_task() is called once here.
    with gr.Tab("Winners"):
        gr.Markdown("Best adapter per task and the gain over the base model.")
        gr.Dataframe(best_per_task(), interactive=False)
    # Tab 4: static background text and links.
    with gr.Tab("About"):
        gr.Markdown(
            """
## About ailiance-bench Phase 6
The bench evaluates LoRA adapters fine-tuned on hardware-design tasks
against the base `gemma-e4b-eu-kiki-base` model. Phase 6 is the final
ship of the 2026-05-11 benchmark cycle.
**Adapters compared:**
- `eu-kiki` β generalist hardware adapter (curriculum LoRA on macm1)
- `mascarade` β domain-specialist family (Qwen3-4B base, per-domain LoRAs on Tower)
- `aggro` β adversarial-data baseline (sanity check)
- `kicad9plus` β corpus-only continual pretrain on KiCad 9+ schematics
**Methodology:** see [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench) `bench-results/compare_base_vs_lora.{md,json}`.
**Production impact:** the ailiance gateway (`:9300`) routes `kicad-dsl` /
`kicad-pcb` to `eu-kiki` (PR #54) and 9 hardware domains to the
mascarade Tower Ollama (PR #49), after this bench validated each
adapter's strengths.
## Links
- π [ailiance.fr](https://ailiance.fr)
- π» [github.com/ailiance](https://github.com/ailiance)
- π¦ [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
- π [bench source](https://github.com/ailiance/ailiance-bench)
- π EU AI Act tags: `art-52`, `art-53`, `gpai-fine-tune`
"""
        )
# Script entry point: start the Gradio server only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Listen on all network interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)