cadforge-cadquery-openenv / inference /compare_cadquery_models.py
sanjuhs's picture
Upload CADForge inference comparison artifacts
58415cd verified
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import textwrap
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from PIL import Image, ImageDraw, ImageFont
# --- Filesystem layout -------------------------------------------------------
# ROOT is the directory that contains this script's own directory.
ROOT = Path(__file__).resolve().parents[1]
APP_ROOT = ROOT.joinpath("experiment-2-cadforge")
# Evaluator script that is launched as a subprocess for each candidate.
CADQUERY_ENV = APP_ROOT.joinpath("python_tools", "cadquery_env.py")
# Preferred interpreter; callers fall back to sys.executable when it is missing.
PYTHON = ROOT.joinpath(".venv", "bin", "python")
TASK_JSON = APP_ROOT.joinpath(
    "data",
    "generated-assets",
    "axial_motor_stator_12_slot",
    "task.json",
)
RESULT_ROOT = ROOT.joinpath("inference", "results")

# Default saved candidate used for the "RL-tuned Qwen" row.
STRICT_QWEN_CODE = APP_ROOT.joinpath(
    "runs",
    "cadquery-env",
    "local-judge-assets-strict-grpo-stator",
    "model-output",
    "candidate.py",
)
# Default saved candidate used for the "GPT-5.4" row.
GPT54_CODE = APP_ROOT.joinpath(
    "runs",
    "cadquery-env",
    "openai-gpt-5.4-axial_motor_stator_12_slot-disconnected-2026-04-25T14-06-43-357Z",
    "step-2",
    "candidate.py",
)

# --- RGB palette for the comparison image ------------------------------------
BG = (247, 249, 250)  # canvas background
INK = (25, 32, 40)  # primary text
MUTED = (88, 102, 116)  # secondary text
LINE = (213, 222, 230)  # card outline
GOOD = (49, 132, 86)  # status bar: built and total reward >= 0.65
MID = (42, 112, 145)  # status bar: built, lower total reward
BAD = (190, 76, 67)  # status bar / notice: build failed
@dataclass(frozen=True)
class Candidate:
    """One model's CadQuery program together with display metadata.

    Instances are immutable. The class deliberately keeps a normal instance
    ``__dict__`` because callers embed ``candidate.__dict__`` in JSON results.
    """

    # Filesystem-safe identifier; used for file and directory names.
    key: str
    # Human-readable model name shown in the image and the report.
    label: str
    # Short description of where the code came from.
    source: str
    # The CadQuery Python source to evaluate.
    code: str
def font(size: int, bold: bool = False) -> ImageFont.ImageFont:
    """Load a TrueType font at ``size``, degrading gracefully across platforms.

    The macOS Arial/SF paths are tried first, in their original order. Only
    when none of them load are generic face names tried, which Pillow resolves
    against the system font directories. If nothing loads, Pillow's built-in
    default font is returned.

    Args:
        size: Requested font size.
        bold: Prefer a bold face when one is available.

    Returns:
        A Pillow font object usable with ``ImageDraw.text``.
    """
    candidates = []
    if bold:
        candidates += [
            "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
            "/Library/Fonts/Arial Bold.ttf",
        ]
    candidates += [
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        "/System/Library/Fonts/SFNS.ttf",
    ]
    # Cross-platform fallbacks, appended last so hosts where the paths above
    # exist keep their previous result.
    if bold:
        candidates += ["DejaVuSans-Bold.ttf", "arialbd.ttf"]
    candidates += ["DejaVuSans.ttf", "arial.ttf"]

    for candidate in candidates:
        try:
            return ImageFont.truetype(candidate, size)
        except OSError:
            continue  # face not installed here; try the next one

    # Newer Pillow releases can scale the built-in font; older ones accept no
    # argument and return a fixed-size bitmap font.
    try:
        return ImageFont.load_default(size)
    except (TypeError, OSError):
        return ImageFont.load_default()
# Shared fonts, loaded once at import time and reused by the drawing code.
FONT_TITLE = font(34, bold=True)  # main heading of the comparison image
FONT_H2 = font(22, bold=True)  # per-panel model label and "no render" notice
FONT_BODY = font(17)  # task prompt and metric rows
FONT_SMALL = font(14)  # candidate source caption
def slug(value: str) -> str:
    """Turn ``value`` into a single, safe path component.

    Every run of characters outside ``[a-zA-Z0-9_.-]`` becomes one ``-`` and
    leading/trailing dashes are removed. Results that are empty or consist
    only of dots (``"."``, ``".."``) are replaced by ``"run"``, because those
    would otherwise resolve to the parent or current directory when joined
    onto a path.

    Args:
        value: Arbitrary user-supplied text, e.g. a run id.

    Returns:
        The sanitized name, or ``"run"`` when nothing usable remains.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9_.-]+", "-", value).strip("-")
    if not cleaned.strip("."):
        return "run"
    return cleaned
def extract_code(text: str) -> str:
    """Pull runnable code out of a raw model response.

    Steps:
      1. Drop ``<think>...</think>`` sections.
      2. Prefer the first closed fence tagged ``python``/``py`` or untagged.
      3. Otherwise accept a fence that starts a line, carries any language
         tag, and may be unterminated (e.g. output cut off by a token limit),
         so the tag line never ends up inside the returned code.
      4. With no fence at all, use the whole text.

    Args:
        text: Model output; ``None`` or empty is treated as an empty string.

    Returns:
        The extracted code with surrounding whitespace and stray backticks
        removed.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", text or "", flags=re.DOTALL | re.IGNORECASE).strip()

    fenced = re.search(r"```(?:python|py)?\s*\n([\s\S]*?)```", cleaned, flags=re.IGNORECASE)
    if fenced is None:
        # Lenient fallback: any tag, closing fence optional, anchored at the
        # start of a line so inline backticks in prose are not mistaken for it.
        fenced = re.search(
            r"^[ \t]*```[ \t]*[\w+.#-]*[ \t]*\r?\n([\s\S]*?)(?:```|\Z)",
            cleaned,
            flags=re.MULTILINE,
        )
    if fenced is not None:
        cleaned = fenced.group(1)
    return cleaned.strip().strip("`").strip()
def read_task() -> dict[str, Any]:
    """Load the task description from ``TASK_JSON``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the host's locale encoding.

    Returns:
        The parsed JSON document. Callers read its ``"prompt"`` entry.
    """
    return json.loads(TASK_JSON.read_text(encoding="utf-8"))
def ollama_generate(model: str, prompt: str, timeout: int, url: str) -> str:
    """Request CadQuery code from an Ollama server through ``/api/chat``.

    Args:
        model: Ollama model tag to run.
        prompt: User message describing the part.
        timeout: Socket timeout in seconds for the HTTP request.
        url: Base URL of the Ollama server; a trailing slash is tolerated.

    Returns:
        The code extracted from the reply (empty string if the reply has no
        usable content).

    Raises:
        RuntimeError: If the request fails at the transport level (URL/HTTP
            errors, timeouts, connection resets) or the response body is not
            a JSON object.
    """
    system = "\n".join(
        [
            "You are a CadQuery CAD code generator.",
            "Return only executable Python code. No markdown fences.",
            "Use import cadquery as cq.",
            "Assign the final exportable object to fixture.",
            "Use robust CadQuery primitives: Workplane.box, Workplane.circle().extrude, Workplane.cylinder, translate, rotate, union, cut.",
            "Avoid unsupported CadQuery APIs, filesystem access, network access, subprocess, and CQ-editor-only helpers.",
            "Prefer named dimensions and helper functions so the CAD is editable.",
        ]
    )
    payload = {
        "model": model,
        "stream": False,
        "keep_alive": "10m",
        "options": {
            "temperature": 0.15,
            "top_p": 0.9,
            "num_ctx": 8192,
            "num_predict": 2600,
        },
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ],
    }
    request = urllib.request.Request(
        url.rstrip("/") + "/api/chat",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw = response.read()
    except OSError as exc:
        # URLError/HTTPError, socket timeouts raised while reading the body and
        # connection resets are all OSError subclasses.
        raise RuntimeError(f"Ollama request failed: {exc}") from exc

    try:
        data = json.loads(raw.decode("utf-8"))
    except ValueError as exc:
        # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        raise RuntimeError(f"Ollama returned an unreadable response: {exc}") from exc
    if not isinstance(data, dict):
        raise RuntimeError(f"Ollama returned an unexpected payload of type {type(data).__name__}")

    # Chat replies carry the text in message.content; fall back to a top-level
    # "response" field when that is missing or empty.
    message = data.get("message")
    content = message.get("content") if isinstance(message, dict) else None
    return extract_code(content or data.get("response") or "")
def openai_generate(model: str, prompt: str, timeout: int) -> str:
    """Request CadQuery code from the OpenAI Responses API.

    Args:
        model: OpenAI model name.
        prompt: User message describing the part.
        timeout: Client timeout passed to ``OpenAI``.

    Returns:
        The code extracted from the response's ``output_text``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is missing or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set; use --gpt-source file or set the key.")

    # Imported lazily so the script still runs without the openai package when
    # the GPT candidate comes from a file.
    from openai import OpenAI

    instruction_lines = (
        "You are a CadQuery CAD code generator.",
        "Return only executable Python code. No markdown fences.",
        "Use import cadquery as cq and assign the final object to fixture.",
        "Prefer named dimensions, helper functions, and robust primitives.",
    )
    system_message = {"role": "system", "content": "\n".join(instruction_lines)}
    user_message = {"role": "user", "content": prompt}

    client = OpenAI(timeout=timeout)
    response = client.responses.create(model=model, input=[system_message, user_message])
    return extract_code(response.output_text or "")
def _failed_evaluation(candidate: Candidate, note: str) -> dict[str, Any]:
    """Build the placeholder result recorded when no evaluator report is usable."""
    return {
        "ok": False,
        "candidate": candidate.__dict__,
        "reward": {"total": -1.0, "build": 0.0},
        "notes": [note],
        "artifacts_dir": "",
    }


def _parse_evaluator_report(stdout: str) -> dict[str, Any] | None:
    """Return the JSON object spanning the first ``{`` to the last ``}``, or None."""
    start = stdout.find("{")
    end = stdout.rfind("}")
    if start < 0 or end < start:
        return None
    try:
        parsed = json.loads(stdout[start : end + 1])
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def evaluate(candidate: Candidate, out_dir: Path, task: dict[str, Any], reward_mode: str, timeout: int) -> dict[str, Any]:
    """Score one candidate with the CadQuery evaluator and collect its artifacts.

    The candidate code is written to ``out_dir/code/<key>.py`` and the
    ``CADQUERY_ENV`` script is run as a subprocess with the ``evaluate``
    subcommand. The JSON report printed on stdout becomes the result. A
    timeout, a non-zero exit status or an unparsable report produces a failure
    result (``total`` -1.0, ``build`` 0.0) instead of aborting the run.

    ``out_dir/<key>/`` is recreated from scratch and receives the candidate
    code, ``result.json`` and, when the evaluator reported an artifacts
    directory, copies of its reward/report/STL files plus the ``renders`` and
    ``masks`` folders.

    Args:
        candidate: Code and metadata to evaluate.
        out_dir: Run output directory.
        task: Task document; its ``"prompt"`` is forwarded to the evaluator.
        reward_mode: Value passed through as ``--reward-mode``.
        timeout: Maximum evaluator run time in seconds.

    Returns:
        The result dict, extended with ``"candidate"`` and
        ``"local_artifacts_dir"``.
    """
    code_dir = out_dir / "code"
    code_dir.mkdir(parents=True, exist_ok=True)
    code_path = code_dir / f"{candidate.key}.py"
    # Python source is UTF-8 by default, so write it that way on every locale.
    code_path.write_text(candidate.code, encoding="utf-8")

    cmd = [
        str(PYTHON if PYTHON.exists() else sys.executable),
        str(CADQUERY_ENV),
        "evaluate",
        "--code-file",
        str(code_path),
        "--episode-id",
        f"inference-{out_dir.name}",
        "--step-id",
        candidate.key,
        "--task-prompt",
        task["prompt"],
        "--task-spec",
        str(TASK_JSON),
        "--reward-mode",
        reward_mode,
    ]
    try:
        proc = subprocess.run(
            cmd,
            cwd=APP_ROOT,
            text=True,
            capture_output=True,
            timeout=timeout,
            env={
                **os.environ,
                "PYTHONPATH": str(APP_ROOT / "python_tools"),
                "XDG_CACHE_HOME": str(APP_ROOT / ".cache"),
            },
        )
    except subprocess.TimeoutExpired:
        result = _failed_evaluation(candidate, f"Evaluator process timed out after {timeout}s")
    else:
        if proc.returncode != 0:
            result = _failed_evaluation(
                candidate,
                f"Evaluator process failed: {(proc.stderr or proc.stdout)[-500:]}",
            )
        else:
            report = _parse_evaluator_report(proc.stdout)
            if report is None:
                result = _failed_evaluation(
                    candidate,
                    f"Evaluator produced no JSON report: {proc.stdout[-500:]}",
                )
            else:
                result = report
                result["candidate"] = candidate.__dict__

    local_dir = out_dir / candidate.key
    if local_dir.exists():
        shutil.rmtree(local_dir)  # drop stale artifacts from an earlier run
    local_dir.mkdir(parents=True, exist_ok=True)
    (local_dir / "candidate.py").write_text(candidate.code, encoding="utf-8")
    (local_dir / "result.json").write_text(json.dumps(result, indent=2), encoding="utf-8")

    # An empty artifacts_dir must be skipped explicitly: Path("") is the current
    # directory, which always exists and could contain unrelated files.
    artifacts_value = result.get("artifacts_dir")
    if artifacts_value:
        artifacts = Path(artifacts_value)
        if artifacts.is_dir():
            for name in ["reward.json", "verifier_report.md", "candidate.stl", "candidate_normalized.stl"]:
                src = artifacts / name
                if src.exists():
                    shutil.copy2(src, local_dir / name)
            for subdir in ["renders", "masks"]:
                src_dir = artifacts / subdir
                if src_dir.exists():
                    shutil.copytree(src_dir, local_dir / subdir, dirs_exist_ok=True)

    result["local_artifacts_dir"] = str(local_dir)
    return result
def reward_value(result: dict[str, Any], key: str) -> float:
    """Read one reward component from an evaluation result as a float.

    Args:
        result: Evaluation result; its ``"reward"`` entry is expected to be a
            mapping of component name to number.
        key: Reward component name, e.g. ``"total"`` or ``"build"``.

    Returns:
        The component as ``float``, or ``0.0`` when the reward mapping is
        absent or not a dict, the key is missing, or the value cannot be
        converted to a float.
    """
    reward = result.get("reward")
    if not isinstance(reward, dict):
        # Covers a missing entry as well as null / malformed rewards, which
        # would otherwise raise AttributeError on .get().
        return 0.0
    try:
        return float(reward.get(key, 0.0))
    except (TypeError, ValueError):
        return 0.0
def fit_image(path: Path, size: tuple[int, int]) -> Image.Image:
    """Load an image and centre it on a white canvas of exactly ``size``.

    The image is shrunk, keeping its aspect ratio, to fit inside ``size`` less
    a 20-pixel allowance per axis. ``thumbnail`` only ever reduces, so smaller
    images keep their original size.

    Args:
        path: Image file to load.
        size: ``(width, height)`` of the returned canvas.

    Returns:
        A new RGB image of the requested size.
    """
    # convert() returns an independent copy, so the file handle can be closed
    # immediately instead of lingering until garbage collection.
    with Image.open(path) as source:
        img = source.convert("RGB")
    img.thumbnail((size[0] - 20, size[1] - 20), Image.Resampling.LANCZOS)
    canvas = Image.new("RGB", size, (255, 255, 255))
    canvas.paste(img, ((size[0] - img.width) // 2, (size[1] - img.height) // 2))
    return canvas
def rounded_card(draw: ImageDraw.ImageDraw, xy: tuple[int, int, int, int], fill=(255, 255, 255)) -> None:
    """Draw a rounded rectangle with the shared card outline.

    Args:
        draw: Drawing context to paint on.
        xy: ``(left, top, right, bottom)`` bounds of the card.
        fill: Interior colour; white by default.
    """
    card_style = {"radius": 8, "fill": fill, "outline": LINE, "width": 2}
    draw.rounded_rectangle(xy, **card_style)
def make_comparison_image(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any]) -> Path:
    """Render one panel per result into ``out_dir/comparison.png``.

    Each panel shows the candidate label, a short source caption, the
    isometric render (or a failure notice when no render file exists) and five
    reward metrics. The coloured bar on top of a panel is green when the build
    reward is at least 1.0 and the total at least 0.65, blue when only the
    build succeeded, and red otherwise.

    Args:
        results: Evaluation results, each with ``"candidate"`` and optionally
            ``"local_artifacts_dir"``.
        out_dir: Directory that receives ``comparison.png``.
        task: Task document; its ``"prompt"`` is printed under the title.

    Returns:
        Path of the written PNG.
    """
    margin = 44
    panel_w, panel_h = 540, 700
    stride = 584  # panel width plus the gap to the next panel
    # 1800 px holds three panels; grow the canvas when there are more so the
    # extra panels are not drawn off the edge.
    w = max(1800, margin + stride * len(results))
    h = 950
    image = Image.new("RGB", (w, h), BG)
    draw = ImageDraw.Draw(image)
    draw.text((44, 28), "Axial Motor Stator: Base Qwen vs RL-Tuned Qwen vs GPT-5.4", fill=INK, font=FONT_TITLE)

    # Keep the prompt to two lines so a long or multi-line prompt cannot run
    # off the canvas or into the panels that start at y=130.
    # NOTE(review): width=190 characters is an estimate for the body font on an
    # 1800 px canvas - confirm against the font actually loaded.
    prompt_lines = textwrap.wrap(task["prompt"], width=190, max_lines=2, placeholder=" ...")
    for line_index, line in enumerate(prompt_lines):
        draw.text((44, 76 + 22 * line_index), line, fill=MUTED, font=FONT_BODY)

    metrics = (
        ("Total", "total"),
        ("Build", "build"),
        ("Semantic", "semantic_parts"),
        ("Reference", "reference_similarity"),
        ("Editability", "editability"),
    )

    for index, result in enumerate(results):
        x = margin + index * stride
        y = 130
        candidate = result["candidate"]
        total = reward_value(result, "total")
        build = reward_value(result, "build")
        color = GOOD if build >= 1.0 and total >= 0.65 else MID if build >= 1.0 else BAD
        rounded_card(draw, (x, y, x + panel_w, y + panel_h))
        draw.rectangle((x, y, x + panel_w, y + 8), fill=color)
        draw.text((x + 22, y + 26), candidate["label"], fill=INK, font=FONT_H2)

        # Shorten the two known long source descriptions (they embed file
        # paths) so the caption fits the panel.
        display_source = candidate["source"]
        if "saved GPT-5.4 artifact" in display_source:
            display_source = "saved GPT-5.4 stator artifact"
        elif "strict build-gated GRPO" in display_source:
            display_source = "strict build-gated GRPO stator artifact"
        for line_index, line in enumerate(textwrap.wrap(display_source, width=58)[:2]):
            draw.text((x + 22, y + 56 + 18 * line_index), line, fill=MUTED, font=FONT_SMALL)

        render_path = Path(result.get("local_artifacts_dir", "")) / "renders" / "isometric.png"
        if render_path.exists():
            image.paste(fit_image(render_path, (panel_w - 44, 390)), (x + 22, y + 92))
        else:
            draw.text((x + 24, y + 250), "No render: build failed", fill=BAD, font=FONT_H2)

        yy = y + 510
        for label, key in metrics:
            draw.text((x + 28, yy), label, fill=MUTED, font=FONT_BODY)
            draw.text((x + 190, yy), f"{reward_value(result, key):.3f}", fill=INK, font=FONT_BODY)
            yy += 34

    out = out_dir / "comparison.png"
    image.save(out)
    return out
def _table_cell(value: object) -> str:
    """Make ``value`` safe for one Markdown table cell (escape pipes, flatten newlines)."""
    return str(value).replace("|", "\\|").replace("\n", " ")


def write_report(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any], comparison: Path) -> Path:
    """Write the Markdown comparison report to ``out_dir/report.md``.

    The report holds the task prompt, the comparison image, one summary table
    row per result and fixed interpretation/reproduction sections.

    Args:
        results: Evaluation results, each with ``"candidate"``, ``"reward"``
            and ``"local_artifacts_dir"``.
        out_dir: Directory that receives ``report.md``; artifact paths in the
            table are shown relative to it when possible.
        task: Task document; its ``"prompt"`` is quoted in the report.
        comparison: Path of the comparison image; its file name is used for
            the image link.

    Returns:
        Path of the written report.
    """
    lines = [
        "# CADForge Inference Comparison",
        "",
        "Task: `axial_motor_stator_12_slot`",
        "",
        task["prompt"],
        "",
        f"![Model comparison]({comparison.name})",
        "",
        "## Summary",
        "",
        "| Model | Source | Total | Build | Semantic | Reference | Editability | Local artifacts |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    for result in results:
        candidate = result["candidate"]
        local_value = result.get("local_artifacts_dir")
        if local_value:
            try:
                local = Path(local_value).relative_to(out_dir)
            except ValueError:
                # Artifacts outside out_dir: show the path as recorded rather
                # than aborting the whole report.
                local = Path(local_value)
        else:
            local = "n/a"
        lines.append(
            "| {label} | {source} | {total:.3f} | {build:.1f} | {semantic:.3f} | {reference:.3f} | {editability:.3f} | `{local}` |".format(
                label=_table_cell(candidate["label"]),
                source=_table_cell(candidate["source"]),
                total=reward_value(result, "total"),
                build=reward_value(result, "build"),
                semantic=reward_value(result, "semantic_parts"),
                reference=reward_value(result, "reference_similarity"),
                editability=reward_value(result, "editability"),
                local=local,
            )
        )
    lines.extend(
        [
            "",
            "## Interpretation",
            "",
            "This is a single-task qualitative comparison, not a leaderboard. The useful signal is that the RL-tuned Qwen adapter produces a buildable, editable stator on the same medium-difficulty part family where a frontier model also succeeds.",
            "",
            "The base Qwen row is generated locally through Ollama when `--baseline-source ollama` is used. The fine-tuned Qwen row defaults to the saved strict-GRPO held-out stator artifact, because the local laptop does not include the full HF/PEFT stack and merged model weights. The script can still run a live HF/PEFT model on a GPU machine by replacing the code source or extending the candidate generator.",
            "",
            "The honest claim is: CADForge does not prove small Qwen beats frontier models yet. It proves that a small model can become competitive on buildable code-CAD behavior when trained inside a strict executable CAD reward environment, and that longer training plus broader reference tasks is the right next scaling path.",
            "",
            "## Reproduce",
            "",
            "```bash",
            ".venv/bin/python inference/compare_cadquery_models.py --baseline-source ollama",
            "```",
            "",
        ]
    )
    report = out_dir / "report.md"
    # The prompt may contain non-ASCII characters; write UTF-8 on every locale.
    report.write_text("\n".join(lines), encoding="utf-8")
    return report
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the comparison script.

    Returns:
        Namespace with ``run_id``, the baseline / fine-tuned / GPT candidate
        sources, ``ollama_url``, ``reward_mode`` and ``timeout``.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        ("--run-id", {"default": "stator-qwen-vs-frontier"}),
        ("--baseline-source", {"choices": ["ollama", "file"], "default": "ollama"}),
        ("--baseline-model", {"default": "qwen3.5:9b"}),
        ("--baseline-code", {"type": Path, "default": None}),
        ("--finetuned-code", {"type": Path, "default": STRICT_QWEN_CODE}),
        ("--gpt-source", {"choices": ["file", "openai"], "default": "file"}),
        ("--gpt-model", {"default": "gpt-5.4"}),
        ("--gpt-code", {"type": Path, "default": GPT54_CODE}),
        ("--ollama-url", {"default": os.environ.get("OLLAMA_URL", "http://localhost:11434")}),
        ("--reward-mode", {"choices": ["full", "fast"], "default": "full"}),
        ("--timeout", {"type": int, "default": 240}),
    )
    cli = argparse.ArgumentParser(
        description="Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on a CadQuery stator task."
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def main() -> None:
    """Run the three-way comparison and print a JSON summary.

    Builds the generation prompt from the task, gathers the base-Qwen,
    RL-tuned-Qwen and GPT candidates (generated live or read from files,
    depending on the options), evaluates each, then writes ``results.json``,
    the comparison image and the Markdown report into the run directory.

    Raises:
        SystemExit: If ``--baseline-source file`` is given without
            ``--baseline-code``.
    """
    args = parse_args()
    task = read_task()
    out_dir = RESULT_ROOT / slug(args.run_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    prompt = "\n".join(
        [
            task["prompt"],
            "",
            "Return only complete executable CadQuery Python code.",
            "The final model must be assigned to fixture.",
        ]
    )

    candidates: list[Candidate] = []

    # Candidate files are Python sources or raw model output; decode them as
    # UTF-8 (Python's default source encoding), not with the host locale.
    if args.baseline_source == "ollama":
        started = time.perf_counter()  # monotonic clock for the elapsed time
        baseline_code = ollama_generate(args.baseline_model, prompt, args.timeout, args.ollama_url)
        source = f"live Ollama `{args.baseline_model}` ({time.perf_counter() - started:.1f}s)"
    else:
        if not args.baseline_code:
            raise SystemExit("--baseline-code is required with --baseline-source file")
        baseline_code = extract_code(args.baseline_code.read_text(encoding="utf-8"))
        # A file under the default run directory is labelled as a saved
        # Ollama output rather than by its path.
        if "stator-qwen-vs-frontier" in str(args.baseline_code):
            source = "saved local Ollama base-Qwen output"
        else:
            source = f"file `{args.baseline_code}`"
    candidates.append(Candidate("base-qwen", "Base Qwen", source, baseline_code))

    candidates.append(
        Candidate(
            "rl-tuned-qwen",
            "RL-tuned Qwen",
            "strict build-gated GRPO held-out stator artifact",
            extract_code(args.finetuned_code.read_text(encoding="utf-8")),
        )
    )

    if args.gpt_source == "openai":
        gpt_code = openai_generate(args.gpt_model, prompt, args.timeout)
        gpt_source = f"live OpenAI `{args.gpt_model}`"
    else:
        gpt_code = extract_code(args.gpt_code.read_text(encoding="utf-8"))
        gpt_source = f"saved GPT-5.4 artifact `{args.gpt_code}`"
    candidates.append(Candidate("gpt-5-4", "GPT-5.4", gpt_source, gpt_code))

    results = [evaluate(candidate, out_dir, task, args.reward_mode, args.timeout) for candidate in candidates]
    (out_dir / "results.json").write_text(json.dumps(results, indent=2), encoding="utf-8")
    comparison = make_comparison_image(results, out_dir, task)
    report = write_report(results, out_dir, task, comparison)
    print(
        json.dumps(
            {
                "report": str(report),
                "comparison": str(comparison),
                "results": [
                    {
                        "model": row["candidate"]["label"],
                        "total": reward_value(row, "total"),
                        "build": reward_value(row, "build"),
                        "artifacts": row.get("local_artifacts_dir"),
                    }
                    for row in results
                ],
            },
            indent=2,
        )
    )


if __name__ == "__main__":
    main()