clemsail committed on
Commit
2f8113e
·
verified ·
1 Parent(s): 683f7f4

feat: initial Phase 6 scoreboard playground

Browse files
Files changed (3) hide show
  1. README.md +22 -7
  2. app.py +189 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,13 +1,28 @@
1
  ---
2
- title: Playground
3
- emoji: 📚
4
- colorFrom: yellow
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
- pinned: false
 
 
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Ailiance Playground
3
+ emoji: 📊
4
+ colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.44.0
 
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ tags:
12
+ - ailiance
13
+ - bench
14
+ - eu-ai-act
15
+ - hardware
16
+ - kicad
17
+ short_description: Phase 6 bench scoreboard for ailiance LoRA adapters
18
  ---
19
 
20
+ # Ailiance Playground
21
+
22
+ Interactive scoreboard for the **ailiance-bench Phase 6** evaluation of
23
+ hardware-domain LoRA adapters. 7 tasks across KiCad/SPICE/ERC, 4
24
+ adapters compared against the base Gemma-E4B model.
25
+
26
+ 🌐 [ailiance.fr](https://ailiance.fr) · 💻 [github.com/ailiance](https://github.com/ailiance) · 📦 [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
27
+
28
+ Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) commit `46801af`.
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ailiance Playground — bench Phase 6 scoreboard.
2
+
3
+ Interactive viewer of ailiance/ailiance-bench Phase 6 results.
4
+ Source of truth: bench-results/compare_base_vs_lora.md (commit 46801af).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+
12
# Phase 6 scoreboard (mirror of bench-results/compare_base_vs_lora.md).
# Base model: gemma-e4b-eu-kiki-base.
# Each row is [phase, task, base score, then one score per adapter column].
SCOREBOARD = pd.DataFrame(
    [
        ["P1", "kicad-dsl", 0.090, 0.640, 0.090, 0.090, 0.090],
        ["P1", "kicad-pcb", 0.010, 0.430, 0.010, 0.010, 0.015],
        ["P1", "spice-sim", 0.425, 0.676, 0.176, 0.189, 0.268],
        ["P2", "kicad-sch-gen", 0.420, 0.220, 0.400, 0.320, 0.180],
        ["P3", "kicad-sch-extract", 0.308, 0.690, 0.785, 0.350, 0.000],
        ["P4", "kicad-erc-abs", 0.060, 0.057, 0.060, 0.060, 0.033],
        ["P5", "kicad-erc-delta", 0.060, 0.057, 0.060, 0.060, 0.033],
    ],
    columns=["Phase", "Task", "base", "+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"],
)

# Adapter score columns of SCOREBOARD, in display order.
ADAPTERS = ["+eu-kiki", "+mascarade", "+aggro", "+kicad9plus"]

# Markdown summary of the bench conclusions; detail lines are nested under
# the adapter they describe.
VERDICTS = """
### Verdicts

- 🥇 **eu-kiki** — generalist champion (4/7 tasks)
  - Peak: P1-DSL **+55 pts**, P1-PCB **+42 pts**
  - Hosted on `:8502` (macm1 Gemma-4 + curriculum LoRA)
- 🥇 **mascarade** — P3 extraction champion (**+48 pts**)
  - Wins narrow extraction tasks but loses generation
  - Hosted on Tower Ollama `:8004`
- ⚠️ **aggro** — neutral (sanity-check baseline)
- ❌ **kicad9plus** — catastrophic forgetting on SPICE/P2/P3
  - **Use only** in permissive-KiCad-only contexts
- 🚫 **kicad-sch from-scratch** — unresolved across all 4 adapters
  - Bottleneck: KiCad 6+ S-expr absent from pre-training corpus
"""

# One-line, human-readable description for every task in SCOREBOARD["Task"].
TASK_DESCRIPTIONS = {
    "kicad-dsl": "Generate KiCad design DSL from a natural language spec",
    "kicad-pcb": "Generate KiCad PCB layout description",
    "spice-sim": "Reason about SPICE circuit simulation behavior",
    "kicad-sch-gen": "Generate a full .kicad_sch file from scratch",
    "kicad-sch-extract": "Extract components/nets from existing .kicad_sch",
    "kicad-erc-abs": "Detect absolute ERC (electrical rule) violations",
    "kicad-erc-delta": "Compute ERC delta between schematic revisions",
}
54
+
55
+
56
def compute_delta(row: pd.Series, adapter: str) -> str:
    """Format an adapter score followed by its Δ vs base, in points.

    Args:
        row: Scoreboard row holding a ``"base"`` score and the score stored
            under ``adapter``.
        adapter: Name of the adapter column to format (e.g. ``"+eu-kiki"``).

    Returns:
        ``"<score> (<delta>)"``: the score with three decimals, then the
        signed difference to base multiplied by 100 and rounded to a whole
        number. A delta that rounds to zero is always shown as ``+0``.
    """
    base = row["base"]
    score = row[adapter]
    delta = (score - base) * 100
    change = f"{delta:+.0f}"
    # A small negative delta (e.g. -0.3) rounds to negative zero and would
    # otherwise be displayed as "-0"; show it like an exact zero instead.
    if change == "-0":
        change = "+0"
    return f"{score:.3f} ({change})"
63
+
64
+
65
def styled_scoreboard() -> pd.DataFrame:
    """Return a display copy of SCOREBOARD with every score as text.

    Each adapter column is replaced by the output of ``compute_delta`` for
    that row, and the ``base`` column is rendered with three decimals.
    SCOREBOARD itself is left untouched.
    """
    display = SCOREBOARD.copy()
    for name in ADAPTERS:
        # Adapter columns are converted one at a time; "base" stays numeric
        # until the loop is done because compute_delta still reads it.
        display[name] = [compute_delta(record, name) for _, record in display.iterrows()]
    display["base"] = display["base"].map("{:.3f}".format)
    return display
72
+
73
+
74
def task_detail(task: str) -> tuple[str, pd.DataFrame]:
    """Build the drill-down view for one task.

    Args:
        task: Task name as listed in ``SCOREBOARD["Task"]``; ``None`` or an
            unknown name yields a placeholder.

    Returns:
        A ``(markdown, table)`` pair. The markdown names the task, its
        description and the base score; the table has one row per adapter
        with its score and its signed difference to base in points
        (difference times 100, one decimal). For ``None`` or an unknown task
        the markdown is a prompt to pick a task and the table is empty.
    """
    if task is None or task not in SCOREBOARD["Task"].values:
        return "Pick a task above to see the per-adapter breakdown.", pd.DataFrame()
    row = SCOREBOARD[SCOREBOARD["Task"] == task].iloc[0]
    base = row["base"]
    rows = []
    for adapter in ADAPTERS:
        score = row[adapter]
        delta = (score - base) * 100
        # Adapter columns carry a leading "+"; drop it for display.
        rows.append([adapter.lstrip("+"), f"{score:.3f}", f"{delta:+.1f} pts"])
    df = pd.DataFrame(rows, columns=["Adapter", "Score", "Δ vs base"])
    description = TASK_DESCRIPTIONS.get(task, "")
    md = f"**{task}** — {description}\n\nBase score: `{base:.3f}` (Gemma-E4B)"
    return md, df
89
+
90
+
91
def best_per_task() -> pd.DataFrame:
    """List the highest-scoring adapter for every task in SCOREBOARD.

    Only the adapter columns compete, so the reported Δ (winner score minus
    base, times 100) is negative when every adapter scores below base. When
    several adapters tie, the one listed first in ADAPTERS is reported.

    Returns:
        A dataframe with columns ``Phase``, ``Task``, ``Winner`` (adapter
        name without its leading "+"), ``Score`` (three decimals) and ``Δ``
        (signed points, one decimal).
    """
    rows = []
    for _, row in SCOREBOARD.iterrows():
        scores = {a: row[a] for a in ADAPTERS}
        # max() keeps the first maximal key, i.e. ADAPTERS order breaks ties.
        winner = max(scores, key=scores.get)
        delta = (scores[winner] - row["base"]) * 100
        rows.append(
            [
                row["Phase"],
                row["Task"],
                winner.lstrip("+"),
                f"{scores[winner]:.3f}",
                f"{delta:+.1f} pts",
            ]
        )
    return pd.DataFrame(rows, columns=["Phase", "Task", "Winner", "Score", "Δ"])
108
+
109
+
110
# Gradio UI: a header plus four tabs (scoreboard, per-task drill-down,
# winners, about). All tables are read-only views built from SCOREBOARD.
with gr.Blocks(
    title="Ailiance Playground — Bench Phase 6",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="yellow"),
) as demo:
    gr.Markdown(
        """
        # 📊 Ailiance Playground

        **Phase 6 bench scoreboard** — 7-task hardware-design evaluation of
        LoRA adapters against the base Gemma-E4B model.

        Source: [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench#scoreboard-lora-phase-6--2026-05-11) · commit `46801af`
        """
    )

    with gr.Tab("Scoreboard"):
        gr.Markdown(
            "Each cell shows the adapter score and Δ in points (× 100) vs base."
        )
        gr.Dataframe(
            styled_scoreboard(),
            interactive=False,
            wrap=True,
        )
        gr.Markdown(VERDICTS)

    with gr.Tab("Task drill-down"):
        gr.Markdown("Pick a task to see per-adapter performance and Δ.")
        task_dropdown = gr.Dropdown(
            choices=list(SCOREBOARD["Task"]),
            label="Task",
            value="kicad-dsl",
        )
        task_md = gr.Markdown()
        task_table = gr.Dataframe(interactive=False)
        task_dropdown.change(
            task_detail, inputs=task_dropdown, outputs=[task_md, task_table]
        )
        # Initial render: fill the drill-down for the default dropdown value
        # on page load, since `change` only fires once the value changes.
        demo.load(task_detail, inputs=task_dropdown, outputs=[task_md, task_table])

    with gr.Tab("Winners"):
        gr.Markdown("Best adapter per task and the gain over the base model.")
        gr.Dataframe(best_per_task(), interactive=False)

    with gr.Tab("About"):
        gr.Markdown(
            """
            ## About ailiance-bench Phase 6

            The bench evaluates LoRA adapters fine-tuned on hardware-design tasks
            against the base `gemma-e4b-eu-kiki-base` model. Phase 6 is the final
            ship of the 2026-05-11 benchmark cycle.

            **Adapters compared:**

            - `eu-kiki` — generalist hardware adapter (curriculum LoRA on macm1)
            - `mascarade` — domain-specialist family (Qwen3-4B base, per-domain LoRAs on Tower)
            - `aggro` — adversarial-data baseline (sanity check)
            - `kicad9plus` — corpus-only continual pretrain on KiCad 9+ schematics

            **Methodology:** see [`ailiance/ailiance-bench`](https://github.com/ailiance/ailiance-bench) `bench-results/compare_base_vs_lora.{md,json}`.

            **Production impact:** the ailiance gateway (`:9300`) routes `kicad-dsl` /
            `kicad-pcb` to `eu-kiki` (PR #54) and 9 hardware domains to the
            mascarade Tower Ollama (PR #49), after this bench validated each
            adapter's strengths.

            ## Links

            - 🌐 [ailiance.fr](https://ailiance.fr)
            - 💻 [github.com/ailiance](https://github.com/ailiance)
            - 📦 [huggingface.co/Ailiance-fr](https://huggingface.co/Ailiance-fr)
            - 📊 [bench source](https://github.com/ailiance/ailiance-bench)
            - 📜 EU AI Act tags: `art-52`, `art-53`, `gpai-fine-tune`
            """
        )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==4.44.0
2
+ pandas>=2.0