"""Ailiance Playground โ€” bench Phase 6 scoreboard. Interactive viewer of ailiance/ailiance-bench Phase 6 results. Source of truth: bench-results/compare_base_vs_lora.md (commit 46801af). """ from __future__ import annotations import gradio as gr import pandas as pd # Phase 6 scoreboard (mirror of bench-results/compare_base_vs_lora.md). # base model: gemma-e4b-eu-kiki-base SCOREBOARD = pd.DataFrame( [ ["P1", "kicad-dsl", 0.090, 0.640, 0.090, 0.090, 0.090], ["P1", "kicad-pcb", 0.010, 0.430, 0.010, 0.010, 0.015], ["P1", "spice-sim", 0.425, 0.676, 0.176, 0.189, 0.268], ["P2", "kicad-sch-gen", 0.420, 0.220, 0.400, 0.320, 0.180], ["P3", "kicad-sch-extract", 0.308, 0.690, 0.785, 0.350, 0.000], ["P4", "kicad-erc-abs", 0.060, 0.057, 0.060, 0.060, 0.033], ["P5", "kicad-erc-delta", 0.060, 0.057, 0.060, 0.060, 0.033], ], columns=["Phase", "Task", "base", "+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"], ) ADAPTERS = ["+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"] VERDICTS = """ ### Verdicts - ๐Ÿฅ‡ **eu-kiki** โ€” generalist champion (4/7 tasks) - Peak: P1-DSL **+55 pts**, P1-PCB **+42 pts** - Hosted on `:8502` (macm1 Gemma-4 + curriculum LoRA) - ๐Ÿฅ‡ **mascarade** โ€” P3 extraction champion (**+48 pts**) - Wins narrow extraction tasks but loses generation - Hosted on Tower Ollama `:8004` - โš ๏ธ **aggro** โ€” neutral (sanity-check baseline) - โŒ **kicad9plus** โ€” catastrophic forgetting on SPICE/P2/P3 - **Use only** in permissive-KiCad-only contexts - ๐Ÿšซ **kicad-sch from-scratch** โ€” unresolved across all 4 adapters - Bottleneck: KiCad 6+ S-expr absent from pre-training corpus """ TASK_DESCRIPTIONS = { "kicad-dsl": "Generate KiCad design DSL from a natural language spec", "kicad-pcb": "Generate KiCad PCB layout description", "spice-sim": "Reason about SPICE circuit simulation behavior", "kicad-sch-gen": "Generate a full .kicad_sch file from scratch", "kicad-sch-extract": "Extract components/nets from existing .kicad_sch", "kicad-erc-abs": "Detect absolute ERC (electrical rule) violations", "kicad-erc-delta": "Compute ERC delta between schematic revisions", } def compute_delta(row: pd.Series, adapter: str) -> str: """Format adapter score with ฮ” vs base in pts.""" base = row["base"] score = row[adapter] delta = (score - base) * 100 sign = "+" if delta >= 0 else "" return f"{score:.3f} ({sign}{delta:.0f})" def styled_scoreboard() -> pd.DataFrame: """Build the display dataframe with deltas in parens.""" df = SCOREBOARD.copy() for adapter in ADAPTERS: df[adapter] = df.apply(lambda r: compute_delta(r, adapter), axis=1) df["base"] = df["base"].map(lambda v: f"{v:.3f}") return df def task_detail(task: str) -> tuple[str, pd.DataFrame]: """Drill-down for one task.""" if task is None or task not in SCOREBOARD["Task"].values: return "Pick a task above to see the per-adapter breakdown.", pd.DataFrame() row = SCOREBOARD[SCOREBOARD["Task"] == task].iloc[0] base = row["base"] rows = [] for adapter in ADAPTERS: score = row[adapter] delta = (score - base) * 100 rows.append([adapter.lstrip("+"), f"{score:.3f}", f"{delta:+.1f} pts"]) df = pd.DataFrame(rows, columns=["Adapter", "Score", "ฮ” vs base"]) description = TASK_DESCRIPTIONS.get(task, "") md = f"**{task}** โ€” {description}\n\nBase score: `{base:.3f}` (Gemma-E4B)" return md, df def best_per_task() -> pd.DataFrame: """Which adapter wins each task?""" rows = [] for _, row in SCOREBOARD.iterrows(): scores = {a: row[a] for a in ADAPTERS} winner = max(scores, key=scores.get) delta = (scores[winner] - row["base"]) * 100 rows.append( [ row["Phase"], row["Task"], winner.lstrip("+"), f"{scores[winner]:.3f}", f"{delta:+.1f} pts", ] ) return pd.DataFrame(rows, columns=["Phase", "Task", "Winner", "Score", "ฮ”"]) with gr.Blocks( title="Ailiance Playground โ€” Bench Phase 6", theme=gr.themes.Soft(), ) as demo: gr.Markdown( """ # ๐Ÿ“Š Ailiance Playground **Phase 6 bench scoreboard** โ€” 7-task hardware-design evaluation of LoRA adapters against the base Gemma-E4B model. Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) ยท commit `46801af` """ ) with gr.Tab("Scoreboard"): gr.Markdown( "Each cell shows the adapter score and ฮ” in points (ร— 100) vs base." ) gr.Dataframe( styled_scoreboard(), interactive=False, wrap=True, ) gr.Markdown(VERDICTS) with gr.Tab("Task drill-down"): gr.Markdown("Pick a task to see per-adapter performance and ฮ”.") task_dropdown = gr.Dropdown( choices=list(SCOREBOARD["Task"]), label="Task", value="kicad-dsl", ) task_md = gr.Markdown() task_table = gr.Dataframe(interactive=False) task_dropdown.change( task_detail, inputs=task_dropdown, outputs=[task_md, task_table] ) # Initial render demo.load(task_detail, inputs=task_dropdown, outputs=[task_md, task_table]) with gr.Tab("Winners"): gr.Markdown("Best adapter per task and the gain over the base model.") gr.Dataframe(best_per_task(), interactive=False) with gr.Tab("About"): gr.Markdown( """ ## About ailiance-bench Phase 6 The bench evaluates LoRA adapters fine-tuned on hardware-design tasks against the base `gemma-e4b-eu-kiki-base` model. Phase 6 is the final ship of the 2026-05-11 benchmark cycle. **Adapters compared:** - `eu-kiki` โ€” generalist hardware adapter (curriculum LoRA on macm1) - `mascarade` โ€” domain-specialist family (Qwen3-4B base, per-domain LoRAs on Tower) - `aggro` โ€” adversarial-data baseline (sanity check) - `kicad9plus` โ€” corpus-only continual pretrain on KiCad 9+ schematics **Methodology:** see [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench) `bench-results/compare_base_vs_lora.{md,json}`. **Production impact:** the ailiance gateway (`:9300`) routes `kicad-dsl` / `kicad-pcb` to `eu-kiki` (PR #54) and 9 hardware domains to the mascarade Tower Ollama (PR #49), after this bench validated each adapter's strengths. ## Links - ๐ŸŒ [ailiance.fr](https://ailiance.fr) - ๐Ÿ’ป [github.com/ailiance](https://github.com/ailiance) - ๐Ÿ“ฆ [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr) - ๐Ÿ“Š [bench source](https://github.com/ailiance/ailiance-bench) - ๐Ÿ“œ EU AI Act tags: `art-52`, `art-53`, `gpai-fine-tune` """ ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)