Spaces:

Ailiance-fr
/

playground

Sleeping

App Files Files Community

clemsail committed 7 days ago

Commit

2f8113e

verified ·

1 Parent(s): 683f7f4

feat: initial Phase 6 scoreboard playground

Browse files

Files changed (3) hide show

README.md +22 -7
app.py +189 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,13 +1,28 @@
 ---
-title: Playground
-emoji: 📚
-colorFrom: yellow
 colorTo: yellow
 sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Ailiance Playground
+emoji: 📊
+colorFrom: blue
 colorTo: yellow
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
+pinned: true
+license: apache-2.0
+tags:
+  - ailiance
+  - bench
+  - eu-ai-act
+  - hardware
+  - kicad
+short_description: Phase 6 bench scoreboard for ailiance LoRA adapters
 ---
+# Ailiance Playground
+Interactive scoreboard for the **ailiance-bench Phase 6** evaluation of
+hardware-domain LoRA adapters. 7 tasks across KiCad/SPICE/ERC, 4
+adapters compared against the base Gemma-E4B model.
+🌐 [ailiance.fr](https://ailiance.fr) · 💻 [github.com/ailiance](https://github.com/ailiance) · 📦 [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
+Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) commit `46801af`.

app.py ADDED Viewed

	@@ -0,0 +1,189 @@

+"""Ailiance Playground — bench Phase 6 scoreboard.
+Interactive viewer of ailiance/ailiance-bench Phase 6 results.
+Source of truth: bench-results/compare_base_vs_lora.md (commit 46801af).
+"""
+from __future__ import annotations
+import gradio as gr
+import pandas as pd
+# Phase 6 scoreboard (mirror of bench-results/compare_base_vs_lora.md).
+# base model: gemma-e4b-eu-kiki-base
+# One row per benchmark task: phase label, task id, the base-model score,
+# then one score column per adapter. The UI derives its deltas from these
+# values as (adapter score - base score) * 100.
+# NOTE(review): scores look like fractions in [0, 1] — confirm against the
+# bench source before changing the display format.
+SCOREBOARD = pd.DataFrame(
+    [
+        ["P1", "kicad-dsl", 0.090, 0.640, 0.090, 0.090, 0.090],
+        ["P1", "kicad-pcb", 0.010, 0.430, 0.010, 0.010, 0.015],
+        ["P1", "spice-sim", 0.425, 0.676, 0.176, 0.189, 0.268],
+        ["P2", "kicad-sch-gen", 0.420, 0.220, 0.400, 0.320, 0.180],
+        ["P3", "kicad-sch-extract", 0.308, 0.690, 0.785, 0.350, 0.000],
+        ["P4", "kicad-erc-abs", 0.060, 0.057, 0.060, 0.060, 0.033],
+        ["P5", "kicad-erc-delta", 0.060, 0.057, 0.060, 0.060, 0.033],
+    ],
+    columns=["Phase", "Task", "base", "+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"],
+)
+# Adapter column names, in display order. Every entry must be a SCOREBOARD
+# column: the helper functions index scoreboard rows by these names.
+ADAPTERS = ["+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"]
+# Markdown summary rendered below the scoreboard table (passed to
+# gr.Markdown in the "Scoreboard" tab). The text is hand-written: its
+# figures are not computed from SCOREBOARD, so keep the two in sync.
+VERDICTS = """
+### Verdicts
+- 🥇 **eu-kiki** — generalist champion (4/7 tasks)
+  - Peak: P1-DSL **+55 pts**, P1-PCB **+42 pts**
+  - Hosted on `:8502` (macm1 Gemma-4 + curriculum LoRA)
+- 🥇 **mascarade** — P3 extraction champion (**+48 pts**)
+  - Wins narrow extraction tasks but loses generation
+  - Hosted on Tower Ollama `:8004`
+- ⚠️ **aggro** — neutral (sanity-check baseline)
+- ❌ **kicad9plus** — catastrophic forgetting on SPICE/P2/P3
+  - **Use only** in permissive-KiCad-only contexts
+- 🚫 **kicad-sch from-scratch** — unresolved across all 4 adapters
+  - Bottleneck: KiCad 6+ S-expr absent from pre-training corpus
+"""
+# One-line description per task id. Keys match the SCOREBOARD "Task"
+# values; task_detail() looks a task up here and falls back to an empty
+# string when the key is missing.
+TASK_DESCRIPTIONS = {
+    "kicad-dsl": "Generate KiCad design DSL from a natural language spec",
+    "kicad-pcb": "Generate KiCad PCB layout description",
+    "spice-sim": "Reason about SPICE circuit simulation behavior",
+    "kicad-sch-gen": "Generate a full .kicad_sch file from scratch",
+    "kicad-sch-extract": "Extract components/nets from existing .kicad_sch",
+    "kicad-erc-abs": "Detect absolute ERC (electrical rule) violations",
+    "kicad-erc-delta": "Compute ERC delta between schematic revisions",
+}
+def compute_delta(row: pd.Series, adapter: str) -> str:
+    """Format an adapter score followed by its Δ vs base, in points.
+
+    Args:
+        row: One scoreboard row; must provide a ``"base"`` entry and an
+            entry named ``adapter``.
+        adapter: Column name of the adapter whose score is formatted.
+
+    Returns:
+        ``"<score> (<signed delta>)"``: the score with three decimals and
+        ``(score - base) * 100`` rounded to a whole number with an explicit
+        sign, e.g. ``"0.640 (+55)"`` or ``"0.033 (-3)"``.
+    """
+    base = row["base"]
+    score = row[adapter]
+    delta = (score - base) * 100
+    sign = "+" if delta >= 0 else ""
+    points = f"{delta:.0f}"
+    # A small negative delta (e.g. -0.3 pts) rounds to the text "-0"; show it
+    # as "+0", exactly like a true zero delta, instead of a signed zero.
+    if points == "-0":
+        sign, points = "+", "0"
+    return f"{score:.3f} ({sign}{points})"
+def styled_scoreboard() -> pd.DataFrame:
+    """Return a display copy of the scoreboard with every score as text.
+
+    Adapter cells become ``"<score> (<delta>)"`` strings produced by
+    ``compute_delta``; the base column is rendered with three decimals.
+    """
+    table = SCOREBOARD.copy()
+    for name in ADAPTERS:
+        # Row-wise apply: each row is handed to compute_delta together with
+        # the adapter column currently being formatted.
+        table[name] = table.apply(compute_delta, axis=1, args=(name,))
+    # Format base last so the numeric values stay available to the loop above.
+    table["base"] = table["base"].map("{:.3f}".format)
+    return table
+def task_detail(task: str) -> tuple[str, pd.DataFrame]:
+    """Return the markdown blurb and per-adapter table for a single task.
+
+    A missing or unknown task yields a prompt message and an empty frame.
+    """
+    known_tasks = SCOREBOARD["Task"].values
+    if task is None or task not in known_tasks:
+        return "Pick a task above to see the per-adapter breakdown.", pd.DataFrame()
+    record = SCOREBOARD.loc[SCOREBOARD["Task"] == task].iloc[0]
+    baseline = record["base"]
+    # One table row per adapter: name without its "+" prefix, raw score,
+    # and the signed gap to the base score expressed in points.
+    breakdown = pd.DataFrame(
+        [
+            [
+                name.lstrip("+"),
+                f"{record[name]:.3f}",
+                f"{(record[name] - baseline) * 100:+.1f} pts",
+            ]
+            for name in ADAPTERS
+        ],
+        columns=["Adapter", "Score", "Δ vs base"],
+    )
+    blurb = TASK_DESCRIPTIONS.get(task, "")
+    summary = f"**{task}** — {blurb}\n\nBase score: `{baseline:.3f}` (Gemma-E4B)"
+    return summary, breakdown
+def best_per_task() -> pd.DataFrame:
+    """List, for every task, the top-scoring adapter and its Δ vs base.
+
+    When several adapters share the top score, the one listed first in
+    ``ADAPTERS`` is reported.
+    """
+    header = ["Phase", "Task", "Winner", "Score", "Δ"]
+    records = []
+    for _, entry in SCOREBOARD.iterrows():
+        # max() keeps the first maximum it meets, i.e. ADAPTERS order on ties.
+        top = max(ADAPTERS, key=lambda name: entry[name])
+        top_score = entry[top]
+        gain = (top_score - entry["base"]) * 100
+        records.append(
+            [
+                entry["Phase"],
+                entry["Task"],
+                top.lstrip("+"),
+                f"{top_score:.3f}",
+                f"{gain:+.1f} pts",
+            ]
+        )
+    return pd.DataFrame(records, columns=header)
+# Build the Gradio UI: a page header followed by four tabs (Scoreboard,
+# Task drill-down, Winners, About). The static tables are computed once,
+# while this block runs, and passed to the components as initial values.
+with gr.Blocks(
+    title="Ailiance Playground — Bench Phase 6",
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="yellow"),
+) as demo:
+    # Page header shown above the tabs.
+    gr.Markdown(
+        """
+# 📊 Ailiance Playground
+**Phase 6 bench scoreboard** — 7-task hardware-design evaluation of
+LoRA adapters against the base Gemma-E4B model.
+Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) · commit `46801af`
+"""
+    )
+    # Tab 1: full score table (pre-formatted as text) plus the verdict notes.
+    with gr.Tab("Scoreboard"):
+        gr.Markdown(
+            "Each cell shows the adapter score and Δ in points (× 100) vs base."
+        )
+        gr.Dataframe(
+            styled_scoreboard(),
+            interactive=False,
+            wrap=True,
+        )
+        gr.Markdown(VERDICTS)
+    # Tab 2: per-task breakdown driven by a dropdown selection.
+    with gr.Tab("Task drill-down"):
+        gr.Markdown("Pick a task to see per-adapter performance and Δ.")
+        task_dropdown = gr.Dropdown(
+            choices=list(SCOREBOARD["Task"]),
+            label="Task",
+            value="kicad-dsl",
+        )
+        # Output components start empty; task_detail fills both of them.
+        task_md = gr.Markdown()
+        task_table = gr.Dataframe(interactive=False)
+        # Refresh the description and table whenever the selection changes.
+        task_dropdown.change(
+            task_detail, inputs=task_dropdown, outputs=[task_md, task_table]
+        )
+        # Initial render
+        # (runs task_detail for the dropdown's default value on page load)
+        demo.load(task_detail, inputs=task_dropdown, outputs=[task_md, task_table])
+    # Tab 3: best adapter per task, computed once from SCOREBOARD.
+    with gr.Tab("Winners"):
+        gr.Markdown("Best adapter per task and the gain over the base model.")
+        gr.Dataframe(best_per_task(), interactive=False)
+    # Tab 4: static background text and links.
+    with gr.Tab("About"):
+        gr.Markdown(
+            """
+## About ailiance-bench Phase 6
+The bench evaluates LoRA adapters fine-tuned on hardware-design tasks
+against the base `gemma-e4b-eu-kiki-base` model. Phase 6 is the final
+ship of the 2026-05-11 benchmark cycle.
+**Adapters compared:**
+- `eu-kiki` — generalist hardware adapter (curriculum LoRA on macm1)
+- `mascarade` — domain-specialist family (Qwen3-4B base, per-domain LoRAs on Tower)
+- `aggro` — adversarial-data baseline (sanity check)
+- `kicad9plus` — corpus-only continual pretrain on KiCad 9+ schematics
+**Methodology:** see [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench) `bench-results/compare_base_vs_lora.{md,json}`.
+**Production impact:** the ailiance gateway (`:9300`) routes `kicad-dsl` /
+`kicad-pcb` to `eu-kiki` (PR #54) and 9 hardware domains to the
+mascarade Tower Ollama (PR #49), after this bench validated each
+adapter's strengths.
+## Links
+- 🌐 [ailiance.fr](https://ailiance.fr)
+- 💻 [github.com/ailiance](https://github.com/ailiance)
+- 📦 [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
+- 📊 [bench source](https://github.com/ailiance/ailiance-bench)
+- 📜 EU AI Act tags: `art-52`, `art-53`, `gpai-fine-tune`
+"""
+        )
+# Launch the app only when this file is executed directly, not on import.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio==4.44.0
2	+ pandas>=2.0