#!/usr/bin/env python3
"""Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on the axial motor stator CadQuery task.

Generates or loads candidate CadQuery code for each model, scores each candidate with the
cadquery_env evaluator, and writes a comparison image plus a markdown report.
"""
from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import textwrap
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from PIL import Image, ImageDraw, ImageFont

ROOT = Path(__file__).resolve().parents[1]
APP_ROOT = ROOT / "experiment-2-cadforge"
CADQUERY_ENV = APP_ROOT / "python_tools" / "cadquery_env.py"
PYTHON = ROOT / ".venv" / "bin" / "python"
TASK_JSON = APP_ROOT / "data" / "generated-assets" / "axial_motor_stator_12_slot" / "task.json"
RESULT_ROOT = ROOT / "inference" / "results"
STRICT_QWEN_CODE = (
    APP_ROOT
    / "runs"
    / "cadquery-env"
    / "local-judge-assets-strict-grpo-stator"
    / "model-output"
    / "candidate.py"
)
GPT54_CODE = (
    APP_ROOT
    / "runs"
    / "cadquery-env"
    / "openai-gpt-5.4-axial_motor_stator_12_slot-disconnected-2026-04-25T14-06-43-357Z"
    / "step-2"
    / "candidate.py"
)

BG = (247, 249, 250)
INK = (25, 32, 40)
MUTED = (88, 102, 116)
LINE = (213, 222, 230)
GOOD = (49, 132, 86)
MID = (42, 112, 145)
BAD = (190, 76, 67)


@dataclass(frozen=True)
class Candidate:
    key: str
    label: str
    source: str
    code: str


def font(size: int, bold: bool = False) -> ImageFont.ImageFont:
    candidates = []
    if bold:
        candidates += [
            "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
            "/Library/Fonts/Arial Bold.ttf",
        ]
    candidates += [
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        "/System/Library/Fonts/SFNS.ttf",
    ]
    for candidate in candidates:
        try:
            return ImageFont.truetype(candidate, size)
        except OSError:
            pass
    return ImageFont.load_default()


FONT_TITLE = font(34, bold=True)
FONT_H2 = font(22, bold=True)
FONT_BODY = font(17)
FONT_SMALL = font(14)


def slug(value: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_.-]+", "-", value).strip("-") or "run"


def extract_code(text: str) -> str:
    # Strip reasoning blocks before pulling out code. The <think> tag name is assumed here;
    # the tag markup in the original pattern was lost.
    value = re.sub(r"<think>.*?</think>", "", text or "", flags=re.DOTALL | re.IGNORECASE).strip()
    match = re.search(r"```(?:python|py)?\s*\n([\s\S]*?)```", value, flags=re.IGNORECASE)
    value = match.group(1) if match else value
    return value.strip().strip("`").strip()


def read_task() -> dict[str, Any]:
    return json.loads(TASK_JSON.read_text())


def ollama_generate(model: str, prompt: str, timeout: int, url: str) -> str:
    system = "\n".join(
        [
            "You are a CadQuery CAD code generator.",
            "Return only executable Python code. No markdown fences.",
No markdown fences.", "Use import cadquery as cq.", "Assign the final exportable object to fixture.", "Use robust CadQuery primitives: Workplane.box, Workplane.circle().extrude, Workplane.cylinder, translate, rotate, union, cut.", "Avoid unsupported CadQuery APIs, filesystem access, network access, subprocess, and CQ-editor-only helpers.", "Prefer named dimensions and helper functions so the CAD is editable.", ] ) payload = { "model": model, "stream": False, "keep_alive": "10m", "options": { "temperature": 0.15, "top_p": 0.9, "num_ctx": 8192, "num_predict": 2600, }, "messages": [ {"role": "system", "content": system}, {"role": "user", "content": prompt}, ], } request = urllib.request.Request( url.rstrip("/") + "/api/chat", data=json.dumps(payload).encode("utf-8"), headers={"Content-Type": "application/json"}, method="POST", ) try: with urllib.request.urlopen(request, timeout=timeout) as response: data = json.loads(response.read().decode("utf-8")) except urllib.error.URLError as exc: raise RuntimeError(f"Ollama request failed: {exc}") from exc return extract_code(data.get("message", {}).get("content") or data.get("response") or "") def openai_generate(model: str, prompt: str, timeout: int) -> str: if not os.environ.get("OPENAI_API_KEY"): raise RuntimeError("OPENAI_API_KEY is not set; use --gpt-source file or set the key.") from openai import OpenAI client = OpenAI(timeout=timeout) response = client.responses.create( model=model, input=[ { "role": "system", "content": "\n".join( [ "You are a CadQuery CAD code generator.", "Return only executable Python code. No markdown fences.", "Use import cadquery as cq and assign the final object to fixture.", "Prefer named dimensions, helper functions, and robust primitives.", ] ), }, {"role": "user", "content": prompt}, ], ) return extract_code(response.output_text or "") def evaluate(candidate: Candidate, out_dir: Path, task: dict[str, Any], reward_mode: str, timeout: int) -> dict[str, Any]: code_dir = out_dir / "code" code_dir.mkdir(parents=True, exist_ok=True) code_path = code_dir / f"{candidate.key}.py" code_path.write_text(candidate.code) cmd = [ str(PYTHON if PYTHON.exists() else sys.executable), str(CADQUERY_ENV), "evaluate", "--code-file", str(code_path), "--episode-id", f"inference-{out_dir.name}", "--step-id", candidate.key, "--task-prompt", task["prompt"], "--task-spec", str(TASK_JSON), "--reward-mode", reward_mode, ] proc = subprocess.run( cmd, cwd=APP_ROOT, text=True, capture_output=True, timeout=timeout, env={ **os.environ, "PYTHONPATH": str(APP_ROOT / "python_tools"), "XDG_CACHE_HOME": str(APP_ROOT / ".cache"), }, ) if proc.returncode != 0: result = { "ok": False, "candidate": candidate.__dict__, "reward": {"total": -1.0, "build": 0.0}, "notes": [f"Evaluator process failed: {(proc.stderr or proc.stdout)[-500:]}"], "artifacts_dir": "", } else: start = proc.stdout.find("{") end = proc.stdout.rfind("}") result = json.loads(proc.stdout[start : end + 1]) result["candidate"] = candidate.__dict__ local_dir = out_dir / candidate.key if local_dir.exists(): shutil.rmtree(local_dir) local_dir.mkdir(parents=True, exist_ok=True) (local_dir / "candidate.py").write_text(candidate.code) (local_dir / "result.json").write_text(json.dumps(result, indent=2)) artifacts = Path(result.get("artifacts_dir") or "") if artifacts.exists(): for name in ["reward.json", "verifier_report.md", "candidate.stl", "candidate_normalized.stl"]: src = artifacts / name if src.exists(): shutil.copy2(src, local_dir / name) for subdir in ["renders", "masks"]: src_dir = artifacts / 
            if src_dir.exists():
                shutil.copytree(src_dir, local_dir / subdir, dirs_exist_ok=True)
    result["local_artifacts_dir"] = str(local_dir)
    return result


def reward_value(result: dict[str, Any], key: str) -> float:
    try:
        return float(result.get("reward", {}).get(key, 0.0))
    except (TypeError, ValueError):
        return 0.0


def fit_image(path: Path, size: tuple[int, int]) -> Image.Image:
    img = Image.open(path).convert("RGB")
    img.thumbnail((size[0] - 20, size[1] - 20), Image.Resampling.LANCZOS)
    canvas = Image.new("RGB", size, (255, 255, 255))
    canvas.paste(img, ((size[0] - img.width) // 2, (size[1] - img.height) // 2))
    return canvas


def rounded_card(draw: ImageDraw.ImageDraw, xy: tuple[int, int, int, int], fill=(255, 255, 255)) -> None:
    draw.rounded_rectangle(xy, radius=8, fill=fill, outline=LINE, width=2)


def make_comparison_image(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any]) -> Path:
    w, h = 1800, 950
    image = Image.new("RGB", (w, h), BG)
    draw = ImageDraw.Draw(image)
    draw.text((44, 28), "Axial Motor Stator: Base Qwen vs RL-Tuned Qwen vs GPT-5.4", fill=INK, font=FONT_TITLE)
    draw.text((44, 76), task["prompt"], fill=MUTED, font=FONT_BODY)
    panel_w, panel_h = 540, 700
    for index, result in enumerate(results):
        x = 44 + index * 584
        y = 130
        candidate = result["candidate"]
        total = reward_value(result, "total")
        build = reward_value(result, "build")
        color = GOOD if build >= 1.0 and total >= 0.65 else MID if build >= 1.0 else BAD
        rounded_card(draw, (x, y, x + panel_w, y + panel_h))
        draw.rectangle((x, y, x + panel_w, y + 8), fill=color)
        draw.text((x + 22, y + 26), candidate["label"], fill=INK, font=FONT_H2)
        display_source = candidate["source"]
        if "saved GPT-5.4 artifact" in display_source:
            display_source = "saved GPT-5.4 stator artifact"
        elif "strict build-gated GRPO" in display_source:
            display_source = "strict build-gated GRPO stator artifact"
        for line_index, line in enumerate(textwrap.wrap(display_source, width=58)[:2]):
            draw.text((x + 22, y + 56 + 18 * line_index), line, fill=MUTED, font=FONT_SMALL)
        render_path = Path(result.get("local_artifacts_dir", "")) / "renders" / "isometric.png"
        if render_path.exists():
            image.paste(fit_image(render_path, (panel_w - 44, 390)), (x + 22, y + 92))
        else:
            draw.text((x + 24, y + 250), "No render: build failed", fill=BAD, font=FONT_H2)
        metrics = [
            ("Total", "total"),
            ("Build", "build"),
            ("Semantic", "semantic_parts"),
            ("Reference", "reference_similarity"),
            ("Editability", "editability"),
        ]
        yy = y + 510
        for label, key in metrics:
            draw.text((x + 28, yy), label, fill=MUTED, font=FONT_BODY)
            draw.text((x + 190, yy), f"{reward_value(result, key):.3f}", fill=INK, font=FONT_BODY)
            yy += 34
    out = out_dir / "comparison.png"
    image.save(out)
    return out


def write_report(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any], comparison: Path) -> Path:
    lines = [
        "# CADForge Inference Comparison",
        "",
        "Task: `axial_motor_stator_12_slot`",
        "",
        task["prompt"],
        "",
        "![Model comparison](comparison.png)",
        "",
        "## Summary",
        "",
        "| Model | Source | Total | Build | Semantic | Reference | Editability | Local artifacts |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    for result in results:
        candidate = result["candidate"]
        local = Path(result.get("local_artifacts_dir", "")).relative_to(out_dir)
        lines.append(
            "| {label} | {source} | {total:.3f} | {build:.1f} | {semantic:.3f} | {reference:.3f} | {editability:.3f} | `{local}` |".format(
                label=candidate["label"],
                source=candidate["source"],
                total=reward_value(result, "total"),
                build=reward_value(result, "build"),
                semantic=reward_value(result, "semantic_parts"),
                reference=reward_value(result, "reference_similarity"),
                editability=reward_value(result, "editability"),
                local=local,
            )
        )
    lines.extend(
        [
            "",
            "## Interpretation",
            "",
            "This is a single-task qualitative comparison, not a leaderboard. The useful signal is that the RL-tuned Qwen adapter produces a buildable, editable stator on the same medium-difficulty part family where a frontier model also succeeds.",
            "",
            "The base Qwen row is generated locally through Ollama when `--baseline-source ollama` is used. The fine-tuned Qwen row defaults to the saved strict-GRPO held-out stator artifact, because the local laptop does not include the full HF/PEFT stack and merged model weights. The script can still run a live HF/PEFT model on a GPU machine by replacing the code source or extending the candidate generator.",
            "",
            "The honest claim is: CADForge does not prove small Qwen beats frontier models yet. It proves that a small model can become competitive on buildable code-CAD behavior when trained inside a strict executable CAD reward environment, and that longer training plus broader reference tasks is the right next scaling path.",
            "",
            "## Reproduce",
            "",
            "```bash",
            ".venv/bin/python inference/compare_cadquery_models.py --baseline-source ollama",
            "```",
            "",
        ]
    )
    report = out_dir / "report.md"
    report.write_text("\n".join(lines))
    return report


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on a CadQuery stator task.")
    parser.add_argument("--run-id", default="stator-qwen-vs-frontier")
    parser.add_argument("--baseline-source", choices=["ollama", "file"], default="ollama")
    parser.add_argument("--baseline-model", default="qwen3.5:9b")
    parser.add_argument("--baseline-code", type=Path, default=None)
    parser.add_argument("--finetuned-code", type=Path, default=STRICT_QWEN_CODE)
    parser.add_argument("--gpt-source", choices=["file", "openai"], default="file")
    parser.add_argument("--gpt-model", default="gpt-5.4")
    parser.add_argument("--gpt-code", type=Path, default=GPT54_CODE)
    parser.add_argument("--ollama-url", default=os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    parser.add_argument("--reward-mode", choices=["full", "fast"], default="full")
    parser.add_argument("--timeout", type=int, default=240)
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    task = read_task()
    out_dir = RESULT_ROOT / slug(args.run_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    prompt = "\n".join(
        [
            task["prompt"],
            "",
            "Return only complete executable CadQuery Python code.",
            "The final model must be assigned to fixture.",
        ]
    )
    candidates: list[Candidate] = []
    if args.baseline_source == "ollama":
        started = time.time()
        baseline_code = ollama_generate(args.baseline_model, prompt, args.timeout, args.ollama_url)
        source = f"live Ollama `{args.baseline_model}` ({time.time() - started:.1f}s)"
    else:
        if not args.baseline_code:
            raise SystemExit("--baseline-code is required with --baseline-source file")
        baseline_code = extract_code(args.baseline_code.read_text())
        if "stator-qwen-vs-frontier" in str(args.baseline_code):
            source = "saved local Ollama base-Qwen output"
        else:
            source = f"file `{args.baseline_code}`"
    candidates.append(Candidate("base-qwen", "Base Qwen", source, baseline_code))
    candidates.append(
        Candidate(
            "rl-tuned-qwen",
            "RL-tuned Qwen",
            "strict build-gated GRPO held-out stator artifact",
            extract_code(args.finetuned_code.read_text()),
        )
    )
    if args.gpt_source == "openai":
        gpt_code = openai_generate(args.gpt_model, prompt, args.timeout)
        gpt_source = f"live OpenAI `{args.gpt_model}`"
    else:
        gpt_code = extract_code(args.gpt_code.read_text())
        gpt_source = f"saved GPT-5.4 artifact `{args.gpt_code}`"
    candidates.append(Candidate("gpt-5-4", "GPT-5.4", gpt_source, gpt_code))

    results = [evaluate(candidate, out_dir, task, args.reward_mode, args.timeout) for candidate in candidates]
    (out_dir / "results.json").write_text(json.dumps(results, indent=2))
    comparison = make_comparison_image(results, out_dir, task)
    report = write_report(results, out_dir, task, comparison)
    print(
        json.dumps(
            {
                "report": str(report),
                "comparison": str(comparison),
                "results": [
                    {
                        "model": row["candidate"]["label"],
                        "total": reward_value(row, "total"),
                        "build": reward_value(row, "build"),
                        "artifacts": row.get("local_artifacts_dir"),
                    }
                    for row in results
                ],
            },
            indent=2,
        )
    )


if __name__ == "__main__":
    main()