Spaces:

sanjuhs
/

cadforge-cadquery-openenv

Running

App Files Files Community

cadforge-cadquery-openenv / inference /compare_cadquery_models.py

sanjuhs

Upload CADForge inference comparison artifacts

58415cd verified 13 days ago

raw

history blame contribute delete

16.9 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import argparse
	import json
	import os
	import re
	import shutil
	import subprocess
	import sys
	import textwrap
	import time
	import urllib.error
	import urllib.request
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	from PIL import Image, ImageDraw, ImageFont


	# --- Repository layout -------------------------------------------------------
	# ROOT is one level above the directory that contains this script.
	ROOT = Path(__file__).resolve().parents[1]
	APP_ROOT = ROOT.joinpath("experiment-2-cadforge")
	# Evaluator CLI that is launched as a subprocess for every candidate.
	CADQUERY_ENV = APP_ROOT.joinpath("python_tools", "cadquery_env.py")
	# Preferred interpreter for the evaluator; callers fall back when it is absent.
	PYTHON = ROOT.joinpath(".venv", "bin", "python")
	TASK_JSON = APP_ROOT.joinpath("data", "generated-assets", "axial_motor_stator_12_slot", "task.json")
	RESULT_ROOT = ROOT.joinpath("inference", "results")

	# --- Saved model outputs used as default candidates --------------------------
	STRICT_QWEN_CODE = APP_ROOT.joinpath(
	    "runs",
	    "cadquery-env",
	    "local-judge-assets-strict-grpo-stator",
	    "model-output",
	    "candidate.py",
	)

	GPT54_CODE = APP_ROOT.joinpath(
	    "runs",
	    "cadquery-env",
	    "openai-gpt-5.4-axial_motor_stator_12_slot-disconnected-2026-04-25T14-06-43-357Z",
	    "step-2",
	    "candidate.py",
	)

	# --- RGB palette for the comparison image ------------------------------------
	BG = (247, 249, 250)  # canvas background
	INK = (25, 32, 40)  # primary text
	MUTED = (88, 102, 116)  # secondary text
	LINE = (213, 222, 230)  # card outline
	GOOD = (49, 132, 86)  # accent: builds and scores well
	MID = (42, 112, 145)  # accent: builds, lower score
	BAD = (190, 76, 67)  # accent: build failed


	@dataclass(frozen=True)
	class Candidate:
	    """One model output entered into the comparison.

	    Instances are immutable. The class intentionally keeps a regular
	    ``__dict__`` (no slots) because the evaluation step serialises
	    ``candidate.__dict__`` into the JSON result.
	    """

	    # Short identifier; used for file/directory names and as the evaluator step id.
	    key: str
	    # Human-readable model name shown on the image card and in the report table.
	    label: str
	    # Free-text description of where the code came from (live call or saved file).
	    source: str
	    # The CadQuery Python source text that gets evaluated.
	    code: str


	def font(size: int, bold: bool = False) -> ImageFont.ImageFont:
	    """Return a font of roughly ``size`` pixels, preferring a TrueType face.

	    The macOS Arial/SF paths are tried first, in their original order, so
	    hosts where those files exist keep the same typeface. Common Linux and
	    Windows faces follow, so the script also renders properly sized text on
	    other machines instead of dropping straight to Pillow's built-in font.

	    Args:
	        size: Requested font size, passed to ``ImageFont.truetype``.
	        bold: When true, bold faces are tried before the regular ones.

	    Returns:
	        The first font that loads, or Pillow's default font when none of the
	        candidate files can be opened.
	    """
	    candidates = []
	    if bold:
	        candidates += [
	            "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
	            "/Library/Fonts/Arial Bold.ttf",
	        ]
	    candidates += [
	        "/System/Library/Fonts/Supplemental/Arial.ttf",
	        "/Library/Fonts/Arial.ttf",
	        "/System/Library/Fonts/SFNS.ttf",
	    ]
	    # Cross-platform fallbacks, appended after the original entries so the
	    # result on a machine that has the fonts above does not change.
	    if bold:
	        candidates += [
	            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
	            "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
	            "C:/Windows/Fonts/arialbd.ttf",
	            "DejaVuSans-Bold.ttf",
	        ]
	    candidates += [
	        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
	        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
	        "C:/Windows/Fonts/arial.ttf",
	        "DejaVuSans.ttf",
	    ]
	    for candidate in candidates:
	        try:
	            return ImageFont.truetype(candidate, size)
	        except OSError:
	            # Font file missing or unreadable on this host; try the next one.
	            continue
	    try:
	        # Newer Pillow releases accept a size for the built-in font.
	        return ImageFont.load_default(size)
	    except TypeError:
	        # Older Pillow: load_default() takes no arguments (fixed-size bitmap font).
	        return ImageFont.load_default()


	# Typefaces shared by every drawing helper; loaded once at import time.
	FONT_TITLE, FONT_H2 = font(34, bold=True), font(22, bold=True)
	FONT_BODY, FONT_SMALL = font(17), font(14)


	def slug(value: str) -> str:
	    """Turn ``value`` into a name that is safe to use as a directory name.

	    Every run of characters outside ``a-z A-Z 0-9 _ . -`` becomes a single
	    hyphen, leading/trailing hyphens are removed, and an empty outcome is
	    replaced by ``"run"``.
	    """
	    collapsed = re.sub(r"[^a-zA-Z0-9_.-]+", "-", value)
	    trimmed = collapsed.strip("-")
	    if not trimmed:
	        return "run"
	    return trimmed


	def extract_code(text: str) -> str:
	    """Pull bare Python source out of a raw model reply.

	    Steps, in order:
	      1. Remove any ``<think>...</think>`` reasoning spans (case-insensitive,
	         may span lines).
	      2. If the reply contains a Markdown code fence (optionally tagged
	         ``python`` or ``py``), keep only the contents of the first fence.
	      3. If the reply *starts* with an opening fence that is never closed
	         (for example because generation was cut off), keep everything after
	         the opening fence line.
	      4. Trim whitespace and stray backticks from both ends.

	    Args:
	        text: Raw model output; ``None`` or an empty string is tolerated.

	    Returns:
	        The extracted code, or an empty string when nothing remains.
	    """
	    value = re.sub(r"<think>.*?</think>", "", text or "", flags=re.DOTALL | re.IGNORECASE).strip()
	    match = re.search(r"```(?:python|py)?\s*\n([\s\S]*?)```", value, flags=re.IGNORECASE)
	    if match is None:
	        # Unterminated fence: only honoured at the very start of the reply so
	        # that a stray fence in the middle of plain code is left alone.
	        match = re.match(r"```(?:python|py)?\s*\n([\s\S]*)", value, flags=re.IGNORECASE)
	    if match is not None:
	        value = match.group(1)
	    return value.strip().strip("`").strip()


	def read_task() -> dict[str, Any]:
	    """Load and decode the task description stored at ``TASK_JSON``."""
	    raw_task = TASK_JSON.read_text()
	    return json.loads(raw_task)


	def ollama_generate(model: str, prompt: str, timeout: int, url: str) -> str:
	    """Ask an Ollama server for CadQuery code and return the extracted source.

	    Sends one non-streaming POST to ``<url>/api/chat`` with a fixed system
	    prompt and the given user prompt, then passes the reply text through
	    ``extract_code``.

	    Args:
	        model: Ollama model tag to run.
	        prompt: User message describing the part to generate.
	        timeout: Socket timeout in seconds for the HTTP request.
	        url: Base URL of the Ollama server; a trailing slash is ignored.

	    Returns:
	        The generated code with reasoning tags and Markdown fences removed.

	    Raises:
	        RuntimeError: If the request fails (connection error, HTTP error, or
	            timeout while waiting for the reply) or the body is not a JSON
	            object.
	    """
	    system = "\n".join(
	        [
	            "You are a CadQuery CAD code generator.",
	            "Return only executable Python code. No markdown fences.",
	            "Use import cadquery as cq.",
	            "Assign the final exportable object to fixture.",
	            "Use robust CadQuery primitives: Workplane.box, Workplane.circle().extrude, Workplane.cylinder, translate, rotate, union, cut.",
	            "Avoid unsupported CadQuery APIs, filesystem access, network access, subprocess, and CQ-editor-only helpers.",
	            "Prefer named dimensions and helper functions so the CAD is editable.",
	        ]
	    )
	    payload = {
	        "model": model,
	        "stream": False,
	        "keep_alive": "10m",
	        "options": {
	            "temperature": 0.15,
	            "top_p": 0.9,
	            "num_ctx": 8192,
	            "num_predict": 2600,
	        },
	        "messages": [
	            {"role": "system", "content": system},
	            {"role": "user", "content": prompt},
	        ],
	    }
	    request = urllib.request.Request(
	        url.rstrip("/") + "/api/chat",
	        data=json.dumps(payload).encode("utf-8"),
	        headers={"Content-Type": "application/json"},
	        method="POST",
	    )
	    try:
	        with urllib.request.urlopen(request, timeout=timeout) as response:
	            raw = response.read()
	    except OSError as exc:
	        # URLError and HTTPError are OSError subclasses, and so is the
	        # TimeoutError raised while waiting for or reading the response,
	        # which urllib does not wrap in URLError.
	        raise RuntimeError(f"Ollama request failed: {exc}") from exc

	    try:
	        data = json.loads(raw.decode("utf-8"))
	    except ValueError as exc:
	        # Covers both JSON syntax errors and undecodable bytes.
	        raise RuntimeError(f"Ollama returned an invalid JSON response: {exc}") from exc
	    if not isinstance(data, dict):
	        raise RuntimeError(f"Ollama returned an unexpected JSON payload: {type(data).__name__}")

	    message = data.get("message")
	    content = message.get("content") if isinstance(message, dict) else None
	    # Fall back to the top-level "response" field when there is no chat message.
	    return extract_code(content or data.get("response") or "")


	def openai_generate(model: str, prompt: str, timeout: int) -> str:
	    """Ask the OpenAI Responses API for CadQuery code and return the source.

	    The ``openai`` package is imported only after the API-key check, so the
	    rest of the script works without it installed.

	    Args:
	        model: OpenAI model name.
	        prompt: User message describing the part to generate.
	        timeout: Timeout handed to the ``OpenAI`` client constructor.

	    Returns:
	        The reply text after ``extract_code`` has cleaned it.

	    Raises:
	        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
	    """
	    api_key = os.environ.get("OPENAI_API_KEY")
	    if not api_key:
	        raise RuntimeError("OPENAI_API_KEY is not set; use --gpt-source file or set the key.")
	    from openai import OpenAI

	    system_prompt = "\n".join(
	        (
	            "You are a CadQuery CAD code generator.",
	            "Return only executable Python code. No markdown fences.",
	            "Use import cadquery as cq and assign the final object to fixture.",
	            "Prefer named dimensions, helper functions, and robust primitives.",
	        )
	    )
	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": prompt},
	    ]

	    client = OpenAI(timeout=timeout)
	    response = client.responses.create(model=model, input=conversation)
	    reply_text = response.output_text or ""
	    return extract_code(reply_text)


	def _failed_result(candidate: Candidate, note: str) -> dict[str, Any]:
	    """Build the placeholder result recorded when the evaluator gives no usable output."""
	    return {
	        "ok": False,
	        "candidate": dict(vars(candidate)),
	        "reward": {"total": -1.0, "build": 0.0},
	        "notes": [note],
	        "artifacts_dir": "",
	    }


	def _parse_evaluator_stdout(stdout: str) -> dict[str, Any] | None:
	    """Return the JSON object embedded in evaluator stdout, or ``None`` if there is none."""
	    # The evaluator may print other lines around the JSON, so take the span
	    # from the first "{" to the last "}".
	    start = stdout.find("{")
	    end = stdout.rfind("}")
	    if start < 0 or end < start:
	        return None
	    try:
	        parsed = json.loads(stdout[start : end + 1])
	    except ValueError:
	        return None
	    return parsed if isinstance(parsed, dict) else None


	def _copy_artifacts(artifacts: Path, local_dir: Path) -> None:
	    """Copy the known evaluator output files and folders into ``local_dir``."""
	    for name in ["reward.json", "verifier_report.md", "candidate.stl", "candidate_normalized.stl"]:
	        src = artifacts / name
	        if src.exists():
	            shutil.copy2(src, local_dir / name)
	    for subdir in ["renders", "masks"]:
	        src_dir = artifacts / subdir
	        if src_dir.exists():
	            shutil.copytree(src_dir, local_dir / subdir, dirs_exist_ok=True)


	def evaluate(candidate: Candidate, out_dir: Path, task: dict[str, Any], reward_mode: str, timeout: int) -> dict[str, Any]:
	    """Score one candidate with the CadQuery evaluator and collect its artifacts.

	    The candidate code is written to ``out_dir/code/<key>.py`` and the
	    evaluator script is run as a subprocess from ``APP_ROOT``. Its JSON output
	    becomes the result. A fresh ``out_dir/<key>/`` directory then receives the
	    code, ``result.json`` and any reward files, meshes, renders and masks
	    found in the evaluator's ``artifacts_dir``.

	    A non-zero exit code, a timeout, or output without a JSON object does not
	    raise. Each is recorded as a failed result (``total`` -1.0, ``build``
	    0.0) so the remaining candidates can still be compared.

	    Args:
	        candidate: The model output to evaluate.
	        out_dir: Directory for this comparison run.
	        task: Task description; only ``task["prompt"]`` is read here.
	        reward_mode: Value forwarded to the evaluator's ``--reward-mode``.
	        timeout: Maximum evaluator run time in seconds.

	    Returns:
	        The evaluator's result dict with ``candidate`` and
	        ``local_artifacts_dir`` added.
	    """
	    code_dir = out_dir / "code"
	    code_dir.mkdir(parents=True, exist_ok=True)
	    code_path = code_dir / f"{candidate.key}.py"
	    code_path.write_text(candidate.code)

	    cmd = [
	        # Prefer the project virtualenv; otherwise reuse the current interpreter.
	        str(PYTHON if PYTHON.exists() else sys.executable),
	        str(CADQUERY_ENV),
	        "evaluate",
	        "--code-file",
	        str(code_path),
	        "--episode-id",
	        f"inference-{out_dir.name}",
	        "--step-id",
	        candidate.key,
	        "--task-prompt",
	        task["prompt"],
	        "--task-spec",
	        str(TASK_JSON),
	        "--reward-mode",
	        reward_mode,
	    ]
	    try:
	        proc = subprocess.run(
	            cmd,
	            cwd=APP_ROOT,
	            text=True,
	            capture_output=True,
	            timeout=timeout,
	            env={
	                **os.environ,
	                "PYTHONPATH": str(APP_ROOT / "python_tools"),
	                "XDG_CACHE_HOME": str(APP_ROOT / ".cache"),
	            },
	        )
	    except subprocess.TimeoutExpired:
	        # A hung candidate must not abort the whole comparison.
	        result = _failed_result(candidate, f"Evaluator process timed out after {timeout}s")
	    else:
	        if proc.returncode != 0:
	            result = _failed_result(
	                candidate,
	                f"Evaluator process failed: {(proc.stderr or proc.stdout)[-500:]}",
	            )
	        else:
	            parsed = _parse_evaluator_stdout(proc.stdout)
	            if parsed is None:
	                result = _failed_result(
	                    candidate,
	                    f"Evaluator produced no JSON result: {proc.stdout[-500:]}",
	                )
	            else:
	                result = parsed
	                result["candidate"] = dict(vars(candidate))

	    # Start from an empty per-candidate directory so stale files from an
	    # earlier run with the same id cannot leak into this one.
	    local_dir = out_dir / candidate.key
	    if local_dir.exists():
	        shutil.rmtree(local_dir)
	    local_dir.mkdir(parents=True, exist_ok=True)
	    (local_dir / "candidate.py").write_text(candidate.code)
	    (local_dir / "result.json").write_text(json.dumps(result, indent=2))

	    # An empty artifacts_dir must be skipped explicitly: Path("") is the
	    # current directory, which always exists.
	    artifacts_dir = result.get("artifacts_dir")
	    if isinstance(artifacts_dir, str) and artifacts_dir:
	        artifacts = Path(artifacts_dir)
	        if artifacts.is_dir():
	            _copy_artifacts(artifacts, local_dir)

	    result["local_artifacts_dir"] = str(local_dir)
	    return result


	def reward_value(result: dict[str, Any], key: str) -> float:
	    """Read one reward component from an evaluation result as a float.

	    Args:
	        result: Evaluation result; its ``"reward"`` entry is expected to be a
	            dict of component name to number.
	        key: Reward component name, e.g. ``"total"`` or ``"build"``.

	    Returns:
	        The component converted with ``float``. ``0.0`` is returned when the
	        reward entry is missing or not a dict, when the component is missing,
	        or when its value cannot be converted.
	    """
	    reward = result.get("reward")
	    if not isinstance(reward, dict):
	        # e.g. {"reward": null} in the evaluator JSON.
	        return 0.0
	    try:
	        return float(reward.get(key, 0.0))
	    except (TypeError, ValueError):
	        return 0.0


	def fit_image(path: Path, size: tuple[int, int]) -> Image.Image:
	    """Load an image and centre it on a white canvas of exactly ``size``.

	    The picture is converted to RGB and passed through ``thumbnail`` with a
	    bounding box 20 px smaller than the canvas in each dimension, then pasted
	    so that the leftover space is split evenly on both axes.

	    Args:
	        path: Image file to open.
	        size: ``(width, height)`` of the returned canvas.

	    Returns:
	        A new RGB image of the requested size.
	    """
	    box_w, box_h = size
	    picture = Image.open(path).convert("RGB")
	    picture.thumbnail((box_w - 20, box_h - 20), Image.Resampling.LANCZOS)

	    backdrop = Image.new("RGB", size, (255, 255, 255))
	    offset = ((box_w - picture.width) // 2, (box_h - picture.height) // 2)
	    backdrop.paste(picture, offset)
	    return backdrop


	def rounded_card(draw: ImageDraw.ImageDraw, xy: tuple[int, int, int, int], fill=(255, 255, 255)) -> None:
	    """Draw a filled card with 8 px rounded corners and a 2 px ``LINE`` outline.

	    Args:
	        draw: Drawing context to paint on.
	        xy: Box passed straight to ``rounded_rectangle``.
	        fill: Interior colour; white by default.
	    """
	    card_style = {"radius": 8, "fill": fill, "outline": LINE, "width": 2}
	    draw.rounded_rectangle(xy, **card_style)


	def make_comparison_image(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any]) -> Path:
	    """Render the side-by-side comparison image and save it as ``comparison.png``.

	    The 1800x950 canvas gets a title, the task prompt, and one 540x700 card
	    per result, placed 584 px apart. Each card shows a coloured top stripe,
	    the model label, up to two wrapped lines describing the source, the
	    isometric render (or a failure notice when the file is missing) and five
	    reward metrics.

	    Stripe colour: ``GOOD`` when build >= 1.0 and total >= 0.65, ``MID`` when
	    only build >= 1.0, otherwise ``BAD``.

	    Args:
	        results: Evaluation results; each needs ``candidate`` and is expected
	            to carry ``local_artifacts_dir``.
	        out_dir: Directory the PNG is written to.
	        task: Task description; ``task["prompt"]`` is drawn under the title.

	    Returns:
	        Path of the saved image.
	    """
	    canvas_size = (1800, 950)
	    card_w, card_h = 540, 700
	    card_top = 130
	    metric_rows = (
	        ("Total", "total"),
	        ("Build", "build"),
	        ("Semantic", "semantic_parts"),
	        ("Reference", "reference_similarity"),
	        ("Editability", "editability"),
	    )

	    image = Image.new("RGB", canvas_size, BG)
	    draw = ImageDraw.Draw(image)
	    draw.text((44, 28), "Axial Motor Stator: Base Qwen vs RL-Tuned Qwen vs GPT-5.4", fill=INK, font=FONT_TITLE)
	    draw.text((44, 76), task["prompt"], fill=MUTED, font=FONT_BODY)

	    for position, result in enumerate(results):
	        left = 44 + position * 584
	        candidate = result["candidate"]

	        # Pick the accent colour from the build gate first, then the total score.
	        if reward_value(result, "build") < 1.0:
	            accent = BAD
	        elif reward_value(result, "total") >= 0.65:
	            accent = GOOD
	        else:
	            accent = MID

	        rounded_card(draw, (left, card_top, left + card_w, card_top + card_h))
	        draw.rectangle((left, card_top, left + card_w, card_top + 8), fill=accent)
	        draw.text((left + 22, card_top + 26), candidate["label"], fill=INK, font=FONT_H2)

	        # Long provenance strings (they embed file paths) are replaced by a
	        # short caption before wrapping.
	        caption = candidate["source"]
	        if "saved GPT-5.4 artifact" in caption:
	            caption = "saved GPT-5.4 stator artifact"
	        elif "strict build-gated GRPO" in caption:
	            caption = "strict build-gated GRPO stator artifact"
	        caption_lines = textwrap.wrap(caption, width=58)[:2]
	        for row, text in enumerate(caption_lines):
	            draw.text((left + 22, card_top + 56 + 18 * row), text, fill=MUTED, font=FONT_SMALL)

	        render_path = Path(result.get("local_artifacts_dir", "")) / "renders" / "isometric.png"
	        if not render_path.exists():
	            draw.text((left + 24, card_top + 250), "No render: build failed", fill=BAD, font=FONT_H2)
	        else:
	            thumbnail = fit_image(render_path, (card_w - 44, 390))
	            image.paste(thumbnail, (left + 22, card_top + 92))

	        for row, (label, key) in enumerate(metric_rows):
	            baseline = card_top + 510 + 34 * row
	            draw.text((left + 28, baseline), label, fill=MUTED, font=FONT_BODY)
	            draw.text((left + 190, baseline), f"{reward_value(result, key):.3f}", fill=INK, font=FONT_BODY)

	    out = out_dir / "comparison.png"
	    image.save(out)
	    return out


	def write_report(results: list[dict[str, Any]], out_dir: Path, task: dict[str, Any], comparison: Path) -> Path:
	    """Write the Markdown comparison report to ``out_dir/report.md``.

	    The report contains the task prompt, a link to ``comparison.png``, one
	    summary-table row per result, a fixed interpretation section and the
	    command to reproduce the run.

	    Args:
	        results: Evaluation results; each needs ``candidate`` with ``label``
	            and ``source``.
	        out_dir: Run directory; artifact paths are shown relative to it when
	            possible.
	        task: Task description; ``task["prompt"]`` is included verbatim.
	        comparison: Path of the comparison image. Not read here: the report
	            links to the fixed relative name ``comparison.png``. Kept so the
	            call signature stays stable.

	    Returns:
	        Path of the written report.
	    """
	    lines = [
	        "# CADForge Inference Comparison",
	        "",
	        "Task: `axial_motor_stator_12_slot`",
	        "",
	        task["prompt"],
	        "",
	        "![Model comparison](comparison.png)",
	        "",
	        "## Summary",
	        "",
	        "| Model | Source | Total | Build | Semantic | Reference | Editability | Local artifacts |",
	        "|---|---|---:|---:|---:|---:|---:|---|",
	    ]
	    for result in results:
	        candidate = result["candidate"]
	        local_path = Path(result.get("local_artifacts_dir", ""))
	        try:
	            local = local_path.relative_to(out_dir)
	        except ValueError:
	            # Missing directory, or one outside out_dir: show it unchanged
	            # rather than failing the whole report.
	            local = local_path
	        lines.append(
	            "| {label} | {source} | {total:.3f} | {build:.1f} | {semantic:.3f} | {reference:.3f} | {editability:.3f} | `{local}` |".format(
	                label=candidate["label"],
	                source=candidate["source"],
	                total=reward_value(result, "total"),
	                build=reward_value(result, "build"),
	                semantic=reward_value(result, "semantic_parts"),
	                reference=reward_value(result, "reference_similarity"),
	                editability=reward_value(result, "editability"),
	                local=local,
	            )
	        )
	    lines.extend(
	        [
	            "",
	            "## Interpretation",
	            "",
	            "This is a single-task qualitative comparison, not a leaderboard. The useful signal is that the RL-tuned Qwen adapter produces a buildable, editable stator on the same medium-difficulty part family where a frontier model also succeeds.",
	            "",
	            "The base Qwen row is generated locally through Ollama when `--baseline-source ollama` is used. The fine-tuned Qwen row defaults to the saved strict-GRPO held-out stator artifact, because the local laptop does not include the full HF/PEFT stack and merged model weights. The script can still run a live HF/PEFT model on a GPU machine by replacing the code source or extending the candidate generator.",
	            "",
	            "The honest claim is: CADForge does not prove small Qwen beats frontier models yet. It proves that a small model can become competitive on buildable code-CAD behavior when trained inside a strict executable CAD reward environment, and that longer training plus broader reference tasks is the right next scaling path.",
	            "",
	            "## Reproduce",
	            "",
	            "```bash",
	            ".venv/bin/python inference/compare_cadquery_models.py --baseline-source ollama",
	            "```",
	            "",
	        ]
	    )
	    report = out_dir / "report.md"
	    report.write_text("\n".join(lines))
	    return report


	def parse_args() -> argparse.Namespace:
	    """Parse the command-line options for a comparison run.

	    Options cover the run id, where each of the three candidates comes from
	    (live model call or saved file), the Ollama URL (default taken from the
	    ``OLLAMA_URL`` environment variable), the reward mode and the timeout in
	    seconds.
	    """
	    parser = argparse.ArgumentParser(description="Compare base Qwen, RL-tuned Qwen, and GPT-5.4 on a CadQuery stator task.")
	    # (flag, add_argument keyword arguments), registered in this order.
	    option_specs = [
	        ("--run-id", {"default": "stator-qwen-vs-frontier"}),
	        ("--baseline-source", {"choices": ["ollama", "file"], "default": "ollama"}),
	        ("--baseline-model", {"default": "qwen3.5:9b"}),
	        ("--baseline-code", {"type": Path, "default": None}),
	        ("--finetuned-code", {"type": Path, "default": STRICT_QWEN_CODE}),
	        ("--gpt-source", {"choices": ["file", "openai"], "default": "file"}),
	        ("--gpt-model", {"default": "gpt-5.4"}),
	        ("--gpt-code", {"type": Path, "default": GPT54_CODE}),
	        ("--ollama-url", {"default": os.environ.get("OLLAMA_URL", "http://localhost:11434")}),
	        ("--reward-mode", {"choices": ["full", "fast"], "default": "full"}),
	        ("--timeout", {"type": int, "default": 240}),
	    ]
	    for flag, options in option_specs:
	        parser.add_argument(flag, **options)
	    return parser.parse_args()


	def main() -> None:
	    """Run the three-way comparison and print a JSON summary.

	    Gathers code for the base model (live Ollama call or file), the RL-tuned
	    model (file) and GPT (file or live OpenAI call), evaluates each one,
	    then writes ``results.json``, the comparison image and the Markdown
	    report into ``RESULT_ROOT/<slug of run id>``.

	    Raises:
	        SystemExit: If ``--baseline-source file`` is used without
	            ``--baseline-code``.
	    """
	    args = parse_args()
	    task = read_task()
	    out_dir = RESULT_ROOT / slug(args.run_id)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    prompt_lines = [
	        task["prompt"],
	        "",
	        "Return only complete executable CadQuery Python code.",
	        "The final model must be assigned to fixture.",
	    ]
	    prompt = "\n".join(prompt_lines)

	    # --- Base model ----------------------------------------------------------
	    if args.baseline_source == "ollama":
	        started = time.time()
	        baseline_code = ollama_generate(args.baseline_model, prompt, args.timeout, args.ollama_url)
	        source = f"live Ollama `{args.baseline_model}` ({time.time() - started:.1f}s)"
	    elif not args.baseline_code:
	        raise SystemExit("--baseline-code is required with --baseline-source file")
	    else:
	        baseline_code = extract_code(args.baseline_code.read_text())
	        # Files from a previous default-named run are labelled as saved Ollama output.
	        from_saved_run = "stator-qwen-vs-frontier" in str(args.baseline_code)
	        source = "saved local Ollama base-Qwen output" if from_saved_run else f"file `{args.baseline_code}`"
	    base_candidate = Candidate("base-qwen", "Base Qwen", source, baseline_code)

	    # --- RL-tuned model (always read from a file) ------------------------------
	    tuned_candidate = Candidate(
	        "rl-tuned-qwen",
	        "RL-tuned Qwen",
	        "strict build-gated GRPO held-out stator artifact",
	        extract_code(args.finetuned_code.read_text()),
	    )

	    # --- GPT --------------------------------------------------------------------
	    if args.gpt_source == "openai":
	        gpt_code = openai_generate(args.gpt_model, prompt, args.timeout)
	        gpt_source = f"live OpenAI `{args.gpt_model}`"
	    else:
	        gpt_code = extract_code(args.gpt_code.read_text())
	        gpt_source = f"saved GPT-5.4 artifact `{args.gpt_code}`"
	    gpt_candidate = Candidate("gpt-5-4", "GPT-5.4", gpt_source, gpt_code)

	    candidates: list[Candidate] = [base_candidate, tuned_candidate, gpt_candidate]

	    results = []
	    for candidate in candidates:
	        results.append(evaluate(candidate, out_dir, task, args.reward_mode, args.timeout))
	    (out_dir / "results.json").write_text(json.dumps(results, indent=2))
	    comparison = make_comparison_image(results, out_dir, task)
	    report = write_report(results, out_dir, task, comparison)

	    score_rows = []
	    for row in results:
	        score_rows.append(
	            {
	                "model": row["candidate"]["label"],
	                "total": reward_value(row, "total"),
	                "build": reward_value(row, "build"),
	                "artifacts": row.get("local_artifacts_dir"),
	            }
	        )
	    summary = {
	        "report": str(report),
	        "comparison": str(comparison),
	        "results": score_rows,
	    }
	    print(json.dumps(summary, indent=2))


	# Run the comparison only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()