"""Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on a CadQuery stator task.

Each candidate's code is scored by the CADForge evaluator; the script then
emits a results JSON, a side-by-side comparison image, and a markdown report.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import textwrap
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from PIL import Image, ImageDraw, ImageFont

# `parents[1]` is the repo root because this script lives in ROOT/inference/.
ROOT = Path(__file__).resolve().parents[1]
APP_ROOT = ROOT / "experiment-2-cadforge"
CADQUERY_ENV = APP_ROOT / "python_tools" / "cadquery_env.py"
PYTHON = ROOT / ".venv" / "bin" / "python"
TASK_JSON = APP_ROOT / "data" / "generated-assets" / "axial_motor_stator_12_slot" / "task.json"
RESULT_ROOT = ROOT / "inference" / "results"


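# Saved candidate code from earlier recorded runs. These defaults let the
# comparison run fully offline; --baseline-source/--gpt-source switch the
# baseline and GPT rows to live generation instead.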
STRICT_QWEN_CODE = (
    APP_ROOT
    / "runs"
    / "cadquery-env"
    / "local-judge-assets-strict-grpo-stator"
    / "model-output"
    / "candidate.py"
)

GPT54_CODE = (
    APP_ROOT
    / "runs"
    / "cadquery-env"
    / "openai-gpt-5.4-axial_motor_stator_12_slot-disconnected-2026-04-25T14-06-43-357Z"
    / "step-2"
    / "candidate.py"
)


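# Palette for the comparison image. GOOD/MID/BAD are the panel status colors:
# GOOD means the candidate built and scored well, MID built but scored lower,
# and BAD failed to build (see make_comparison_image).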
BG = (247, 249, 250)
INK = (25, 32, 40)
MUTED = (88, 102, 116)
LINE = (213, 222, 230)
GOOD = (49, 132, 86)
MID = (42, 112, 145)
BAD = (190, 76, 67)


@dataclass(frozen=True)
class Candidate:
    """One model's generated CadQuery code plus provenance for the report."""

    key: str
    label: str
    source: str
    code: str


def font(size: int, bold: bool = False) -> ImageFont.ImageFont:
    """Load the first available system font, falling back to Pillow's default."""
    candidates = []
    if bold:
        candidates += [
            "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
            "/Library/Fonts/Arial Bold.ttf",
        ]
    candidates += [
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        "/System/Library/Fonts/SFNS.ttf",
    ]
    for candidate in candidates:
        try:
            return ImageFont.truetype(candidate, size)
        except OSError:
            pass
    return ImageFont.load_default()


# The paths above are macOS font locations; on other platforms the loop falls
# through to Pillow's built-in default font.
FONT_TITLE = font(34, bold=True)
FONT_H2 = font(22, bold=True)
FONT_BODY = font(17)
FONT_SMALL = font(14)


def slug(value: str) -> str:
    """Sanitize a run id into a filesystem-safe directory name."""
    return re.sub(r"[^a-zA-Z0-9_.-]+", "-", value).strip("-") or "run"


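# extract_code strips reasoning tags and markdown fences from a model reply.
# For example, '<think>plan</think>\n```python\nimport cadquery as cq\n```'
# reduces to 'import cadquery as cq'.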
def extract_code(text: str) -> str:
    value = re.sub(r"<think>.*?</think>", "", text or "", flags=re.DOTALL | re.IGNORECASE).strip()
    match = re.search(r"```(?:python|py)?\s*\n([\s\S]*?)```", value, flags=re.IGNORECASE)
    value = match.group(1) if match else value
    return value.strip().strip("`").strip()


def read_task() -> dict[str, Any]:
    return json.loads(TASK_JSON.read_text())


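# ollama_generate calls Ollama's /api/chat endpoint. A non-streamed response
# looks roughly like {"message": {"role": "assistant", "content": "..."},
# "done": true}; the data.get("response") fallback also tolerates the
# /api/generate response shape.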
def ollama_generate(model: str, prompt: str, timeout: int, url: str) -> str:
    system = "\n".join(
        [
            "You are a CadQuery CAD code generator.",
            "Return only executable Python code. No markdown fences.",
            "Use import cadquery as cq.",
            "Assign the final exportable object to fixture.",
            "Use robust CadQuery primitives: Workplane.box, Workplane.circle().extrude, Workplane.cylinder, translate, rotate, union, cut.",
            "Avoid unsupported CadQuery APIs, filesystem access, network access, subprocess, and CQ-editor-only helpers.",
            "Prefer named dimensions and helper functions so the CAD is editable.",
        ]
    )
    payload = {
        "model": model,
        "stream": False,
        "keep_alive": "10m",
        "options": {
            "temperature": 0.15,
            "top_p": 0.9,
            "num_ctx": 8192,
            "num_predict": 2600,
        },
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ],
    }
    request = urllib.request.Request(
        url.rstrip("/") + "/api/chat",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            data = json.loads(response.read().decode("utf-8"))
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Ollama request failed: {exc}") from exc
    return extract_code(data.get("message", {}).get("content") or data.get("response") or "")


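# openai_generate uses the OpenAI Responses API; response.output_text is the
# SDK's convenience property that concatenates the text output items.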
def openai_generate(model: str, prompt: str, timeout: int) -> str:
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is not set; use --gpt-source file or set the key.")
    # Imported lazily so the script still runs without the openai package
    # when --gpt-source file is used.
    from openai import OpenAI

    client = OpenAI(timeout=timeout)
    response = client.responses.create(
        model=model,
        input=[
            {
                "role": "system",
                "content": "\n".join(
                    [
                        "You are a CadQuery CAD code generator.",
                        "Return only executable Python code. No markdown fences.",
                        "Use import cadquery as cq and assign the final object to fixture.",
                        "Prefer named dimensions, helper functions, and robust primitives.",
                    ]
                ),
            },
            {"role": "user", "content": prompt},
        ],
    )
    return extract_code(response.output_text or "")


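# evaluate shells out to the cadquery_env.py evaluator and parses the JSON it
# prints. The shape assumed below is inferred from how this script reads it:
#   {"ok": bool,
#    "reward": {"total": ..., "build": ..., "semantic_parts": ...,
#               "reference_similarity": ..., "editability": ...},
#    "notes": [...], "artifacts_dir": "<run-specific path>"}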
def evaluate(candidate: Candidate, out_dir: Path, task: dict[str, Any], reward_mode: str, timeout: int) -> dict[str, Any]:
    code_dir = out_dir / "code"
    code_dir.mkdir(parents=True, exist_ok=True)
    code_path = code_dir / f"{candidate.key}.py"
    code_path.write_text(candidate.code)

    cmd = [
        str(PYTHON if PYTHON.exists() else sys.executable),
        str(CADQUERY_ENV),
        "evaluate",
        "--code-file",
        str(code_path),
        "--episode-id",
        f"inference-{out_dir.name}",
        "--step-id",
        candidate.key,
        "--task-prompt",
        task["prompt"],
        "--task-spec",
        str(TASK_JSON),
        "--reward-mode",
        reward_mode,
    ]
    proc = subprocess.run(
        cmd,
        cwd=APP_ROOT,
        text=True,
        capture_output=True,
        timeout=timeout,
        env={
            **os.environ,
            "PYTHONPATH": str(APP_ROOT / "python_tools"),
            "XDG_CACHE_HOME": str(APP_ROOT / ".cache"),
        },
    )
    # The evaluator may print log lines around the JSON payload, so slice out
    # the outermost braces before parsing.
    start = proc.stdout.find("{")
    end = proc.stdout.rfind("}")
    if proc.returncode != 0 or start == -1 or end == -1:
        # A crashed evaluator or a missing JSON payload counts as a failed build.
        result = {
            "ok": False,
            "reward": {"total": -1.0, "build": 0.0},
            "notes": [f"Evaluator process failed: {(proc.stderr or proc.stdout)[-500:]}"],
            "artifacts_dir": "",
        }
    else:
        result = json.loads(proc.stdout[start : end + 1])
    result["candidate"] = candidate.__dict__

    # Mirror the interesting artifacts into this run's results directory.
    local_dir = out_dir / candidate.key
    if local_dir.exists():
        shutil.rmtree(local_dir)
    local_dir.mkdir(parents=True, exist_ok=True)
    (local_dir / "candidate.py").write_text(candidate.code)
    (local_dir / "result.json").write_text(json.dumps(result, indent=2))

    artifacts_dir = result.get("artifacts_dir") or ""
    artifacts = Path(artifacts_dir)
    # Path("") resolves to ".", so require a non-empty path before copying.
    if artifacts_dir and artifacts.exists():
        for name in ["reward.json", "verifier_report.md", "candidate.stl", "candidate_normalized.stl"]:
            src = artifacts / name
            if src.exists():
                shutil.copy2(src, local_dir / name)
        for subdir in ["renders", "masks"]:
            src_dir = artifacts / subdir
            if src_dir.exists():
                shutil.copytree(src_dir, local_dir / subdir, dirs_exist_ok=True)

    result["local_artifacts_dir"] = str(local_dir)
    return result


def reward_value(result: dict[str, Any], key: str) -> float:
    """Read one reward component, defaulting to 0.0 for missing or bad values."""
    try:
        return float(result.get("reward", {}).get(key, 0.0))
    except (TypeError, ValueError):
        return 0.0


def fit_image(path: Path, size: tuple[int, int]) -> Image.Image:
    """Scale an image to fit inside `size` (with a margin) and center it on white."""
    img = Image.open(path).convert("RGB")
    img.thumbnail((size[0] - 20, size[1] - 20), Image.Resampling.LANCZOS)
    canvas = Image.new("RGB", size, (255, 255, 255))
    canvas.paste(img, ((size[0] - img.width) // 2, (size[1] - img.height) // 2))
    return canvas


def rounded_card(draw: ImageDraw.ImageDraw, xy: tuple[int, int, int, int], fill=(255, 255, 255)) -> None:
    draw.rounded_rectangle(xy, radius=8, fill=fill, outline=LINE, width=2)


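# Panel layout: three 540x700 cards on a 584 px horizontal pitch fit inside
# the 1800 px canvas with margin (44 + 2 * 584 + 540 = 1752 < 1800).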
def make_comparison_image(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any]) -> Path:
    w, h = 1800, 950
    image = Image.new("RGB", (w, h), BG)
    draw = ImageDraw.Draw(image)
    draw.text((44, 28), "Axial Motor Stator: Base Qwen vs RL-Tuned Qwen vs GPT-5.4", fill=INK, font=FONT_TITLE)
    draw.text((44, 76), task["prompt"], fill=MUTED, font=FONT_BODY)

    panel_w, panel_h = 540, 700
    for index, result in enumerate(results):
        x = 44 + index * 584
        y = 130
        candidate = result["candidate"]
        total = reward_value(result, "total")
        build = reward_value(result, "build")
        color = GOOD if build >= 1.0 and total >= 0.65 else MID if build >= 1.0 else BAD
        rounded_card(draw, (x, y, x + panel_w, y + panel_h))
        draw.rectangle((x, y, x + panel_w, y + 8), fill=color)
        draw.text((x + 22, y + 26), candidate["label"], fill=INK, font=FONT_H2)
        # Shorten the long saved-artifact source strings for the panel header.
        display_source = candidate["source"]
        if "saved GPT-5.4 artifact" in display_source:
            display_source = "saved GPT-5.4 stator artifact"
        elif "strict build-gated GRPO" in display_source:
            display_source = "strict build-gated GRPO stator artifact"
        for line_index, line in enumerate(textwrap.wrap(display_source, width=58)[:2]):
            draw.text((x + 22, y + 56 + 18 * line_index), line, fill=MUTED, font=FONT_SMALL)

        render_path = Path(result.get("local_artifacts_dir", "")) / "renders" / "isometric.png"
        if render_path.exists():
            image.paste(fit_image(render_path, (panel_w - 44, 390)), (x + 22, y + 92))
        else:
            draw.text((x + 24, y + 250), "No render: build failed", fill=BAD, font=FONT_H2)

        metrics = [
            ("Total", "total"),
            ("Build", "build"),
            ("Semantic", "semantic_parts"),
            ("Reference", "reference_similarity"),
            ("Editability", "editability"),
        ]
        yy = y + 510
        for label, key in metrics:
            draw.text((x + 28, yy), label, fill=MUTED, font=FONT_BODY)
            draw.text((x + 190, yy), f"{reward_value(result, key):.3f}", fill=INK, font=FONT_BODY)
            yy += 34

    out = out_dir / "comparison.png"
    image.save(out)
    return out


def write_report(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any], comparison: Path) -> Path:
    lines = [
        "# CADForge Inference Comparison",
        "",
        "Task: `axial_motor_stator_12_slot`",
        "",
        task["prompt"],
        "",
        f"![Comparison]({comparison.relative_to(out_dir)})",
        "",
        "## Summary",
        "",
        "| Model | Source | Total | Build | Semantic | Reference | Editability | Local artifacts |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    for result in results:
        candidate = result["candidate"]
        local = Path(result.get("local_artifacts_dir", "")).relative_to(out_dir)
        lines.append(
            "| {label} | {source} | {total:.3f} | {build:.1f} | {semantic:.3f} | {reference:.3f} | {editability:.3f} | `{local}` |".format(
                label=candidate["label"],
                source=candidate["source"],
                total=reward_value(result, "total"),
                build=reward_value(result, "build"),
                semantic=reward_value(result, "semantic_parts"),
                reference=reward_value(result, "reference_similarity"),
                editability=reward_value(result, "editability"),
                local=local,
            )
        )
    lines.extend(
        [
            "",
            "## Interpretation",
            "",
            "This is a single-task qualitative comparison, not a leaderboard. The useful signal is that the RL-tuned Qwen adapter produces a buildable, editable stator on the same medium-difficulty part family where a frontier model also succeeds.",
            "",
            "The base Qwen row is generated locally through Ollama when `--baseline-source ollama` is used. The fine-tuned Qwen row defaults to the saved strict-GRPO held-out stator artifact, because the local laptop does not include the full HF/PEFT stack and merged model weights. The script can still run a live HF/PEFT model on a GPU machine by replacing the code source or extending the candidate generator.",
            "",
            "The honest claim is: CADForge does not prove small Qwen beats frontier models yet. It proves that a small model can become competitive on buildable code-CAD behavior when trained inside a strict executable CAD reward environment, and that longer training plus broader reference tasks is the right next scaling path.",
            "",
            "## Reproduce",
            "",
            "```bash",
            ".venv/bin/python inference/compare_cadquery_models.py --baseline-source ollama",
            "```",
            "",
        ]
    )
    report = out_dir / "report.md"
    report.write_text("\n".join(lines))
    return report


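# Typical invocations, run from the repo root (the second is an illustrative
# combination of the flags defined below):
#   .venv/bin/python inference/compare_cadquery_models.py --baseline-source ollama
#   .venv/bin/python inference/compare_cadquery_models.py --gpt-source openai --reward-mode fast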
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on a CadQuery stator task.")
    parser.add_argument("--run-id", default="stator-qwen-vs-frontier")
    parser.add_argument("--baseline-source", choices=["ollama", "file"], default="ollama")
    parser.add_argument("--baseline-model", default="qwen3.5:9b")
    parser.add_argument("--baseline-code", type=Path, default=None)
    parser.add_argument("--finetuned-code", type=Path, default=STRICT_QWEN_CODE)
    parser.add_argument("--gpt-source", choices=["file", "openai"], default="file")
    parser.add_argument("--gpt-model", default="gpt-5.4")
    parser.add_argument("--gpt-code", type=Path, default=GPT54_CODE)
    parser.add_argument("--ollama-url", default=os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    parser.add_argument("--reward-mode", choices=["full", "fast"], default="full")
    parser.add_argument("--timeout", type=int, default=240)
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    task = read_task()
    out_dir = RESULT_ROOT / slug(args.run_id)
    out_dir.mkdir(parents=True, exist_ok=True)

    prompt = "\n".join(
        [
            task["prompt"],
            "",
            "Return only complete executable CadQuery Python code.",
            "The final model must be assigned to fixture.",
        ]
    )

    # Assemble the three candidates: base Qwen, RL-tuned Qwen, and GPT-5.4.
    candidates: list[Candidate] = []
    if args.baseline_source == "ollama":
        started = time.time()
        baseline_code = ollama_generate(args.baseline_model, prompt, args.timeout, args.ollama_url)
        source = f"live Ollama `{args.baseline_model}` ({time.time() - started:.1f}s)"
    else:
        if not args.baseline_code:
            raise SystemExit("--baseline-code is required with --baseline-source file")
        baseline_code = extract_code(args.baseline_code.read_text())
        # Heuristic: files saved under the default run id are our own earlier
        # base-Qwen Ollama generations.
        if "stator-qwen-vs-frontier" in str(args.baseline_code):
            source = "saved local Ollama base-Qwen output"
        else:
            source = f"file `{args.baseline_code}`"
    candidates.append(Candidate("base-qwen", "Base Qwen", source, baseline_code))

    candidates.append(
        Candidate(
            "rl-tuned-qwen",
            "RL-tuned Qwen",
            "strict build-gated GRPO held-out stator artifact",
            extract_code(args.finetuned_code.read_text()),
        )
    )

    if args.gpt_source == "openai":
        gpt_code = openai_generate(args.gpt_model, prompt, args.timeout)
        gpt_source = f"live OpenAI `{args.gpt_model}`"
    else:
        gpt_code = extract_code(args.gpt_code.read_text())
        gpt_source = f"saved GPT-5.4 artifact `{args.gpt_code}`"
    candidates.append(Candidate("gpt-5-4", "GPT-5.4", gpt_source, gpt_code))

    results = [evaluate(candidate, out_dir, task, args.reward_mode, args.timeout) for candidate in candidates]
    (out_dir / "results.json").write_text(json.dumps(results, indent=2))
    comparison = make_comparison_image(results, out_dir, task)
    report = write_report(results, out_dir, task, comparison)

    # Print a machine-readable summary so callers can locate the artifacts.
    print(
        json.dumps(
            {
                "report": str(report),
                "comparison": str(comparison),
                "results": [
                    {
                        "model": row["candidate"]["label"],
                        "total": reward_value(row, "total"),
                        "build": reward_value(row, "build"),
                        "artifacts": row.get("local_artifacts_dir"),
                    }
                    for row in results
                ],
            },
            indent=2,
        )
    )


if __name__ == "__main__":
    main()