from __future__ import annotations import json from pathlib import Path from fastapi import HTTPException, Request from fastapi.responses import FileResponse, HTMLResponse, JSONResponse from openenv.core.env_server.http_server import create_app from cadquery_env import APP_ROOT, evaluate_code, read_task_spec from .cadforge_environment import CadForgeCadQueryEnvironment from .openenv_models import CadForgeAction, CadForgeObservation app = create_app( CadForgeCadQueryEnvironment, CadForgeAction, CadForgeObservation, env_name="cadforge_cadquery", max_concurrent_envs=4, ) DEMO_TASKS = { "caster_wheel_fork": { "label": "Caster wheel fork", "prompt": "Design a small caster wheel assembly as editable code-CAD. Include a wheel, axle, U-shaped fork, swivel stem, and top mounting plate with four holes.", "broken_code": r''' import cadquery as cq # Weak seed: buildable blank plate with almost no requested assembly detail. plate = cq.Workplane("XY").box(44, 44, 4).translate((0, 0, 2)) fixture = plate.clean() '''.strip(), "code": r''' import cadquery as cq plate_w = 44.0 plate_d = 44.0 plate_t = 4.0 stem_d = 10.0 stem_h = 16.0 fork_inside_w = 22.0 fork_leg_t = 4.0 fork_leg_depth = 18.0 fork_leg_h = 24.0 wheel_d = 22.0 wheel_w = 10.0 axle_d = 4.2 def make_top_plate(): plate = cq.Workplane("XY").box(plate_w, plate_d, plate_t).translate((0, 0, plate_t / 2)) holes = ( cq.Workplane("XY") .pushPoints([(-14, -14), (-14, 14), (14, -14), (14, 14)]) .circle(2.1) .extrude(plate_t + 2) .translate((0, 0, -1)) ) return plate.cut(holes) def make_fork(): left_x = -(fork_inside_w / 2 + fork_leg_t / 2) right_x = fork_inside_w / 2 + fork_leg_t / 2 left_leg = cq.Workplane("XY").box(fork_leg_t, fork_leg_depth, fork_leg_h).translate((left_x, 0, plate_t + fork_leg_h / 2 - 1)) right_leg = cq.Workplane("XY").box(fork_leg_t, fork_leg_depth, fork_leg_h).translate((right_x, 0, plate_t + fork_leg_h / 2 - 1)) bridge = cq.Workplane("XY").box(fork_inside_w + 2 * fork_leg_t, fork_leg_depth, 4).translate((0, 0, plate_t + 1.5)) return left_leg.union(right_leg).union(bridge) plate = make_top_plate() stem = cq.Workplane("XY").cylinder(stem_h, stem_d / 2).translate((0, 0, plate_t + stem_h / 2 - 0.5)) fork = make_fork() wheel = cq.Workplane("XY").cylinder(wheel_w, wheel_d / 2).cut(cq.Workplane("XY").cylinder(wheel_w + 2, 2.5)).translate((0, 0, 18)) axle = cq.Workplane("XY").cylinder(fork_inside_w + 3, axle_d / 2).translate((0, 0, 18)) fixture = plate.union(stem).union(fork).union(wheel).union(axle).clean() '''.strip(), }, "axial_motor_stator_12_slot": { "label": "12-slot motor stator", "prompt": "Design a simple 12-slot axial motor stator concept with a circular ring, radial teeth, and center shaft opening.", "broken_code": r''' import cadquery as cq # Weak seed: buildable disk with a center bore, but no teeth or slot structure. outer_radius = 60.0 shaft_radius = 12.0 thickness = 8.0 disk = cq.Workplane("XY").circle(outer_radius).extrude(thickness) bore = cq.Workplane("XY").circle(shaft_radius).extrude(thickness + 2).translate((0, 0, -1)) fixture = disk.cut(bore).clean() '''.strip(), "code": r''' import cadquery as cq # Editable axial motor stator concept with twelve radial teeth and center bore. stator_slot_count = 12 stator_outer_radius = 60.0 stator_inner_radius = 24.0 stator_shaft_radius = 11.0 stator_thickness = 8.0 radial_tooth_length = 28.0 radial_tooth_width = 10.0 back_iron_width = stator_outer_radius - stator_inner_radius def make_stator_ring(): outer = cq.Workplane("XY").circle(stator_outer_radius).extrude(stator_thickness) inner_cut = cq.Workplane("XY").circle(stator_inner_radius).extrude(stator_thickness + 2).translate((0, 0, -1)) return outer.cut(inner_cut) def make_radial_tooth(index): angle = 360.0 * index / stator_slot_count tooth_center = stator_inner_radius + radial_tooth_length / 2.0 tooth = ( cq.Workplane("XY") .center(tooth_center, 0) .rect(radial_tooth_length, radial_tooth_width) .extrude(stator_thickness + 1.0) ) root_pad = ( cq.Workplane("XY") .center(stator_inner_radius + 2.0, 0) .rect(8.0, radial_tooth_width + 4.0) .extrude(stator_thickness + 1.0) ) return tooth.union(root_pad).rotate((0, 0, 0), (0, 0, 1), angle) def make_twelve_slot_tooth_set(): teeth = None for tooth_index in range(stator_slot_count): radial_tooth = make_radial_tooth(tooth_index) teeth = radial_tooth if teeth is None else teeth.union(radial_tooth) return teeth stator_ring = make_stator_ring() twelve_radial_teeth = make_twelve_slot_tooth_set() center_shaft_opening = cq.Workplane("XY").circle(stator_shaft_radius).extrude(stator_thickness + 4).translate((0, 0, -2)) fixture = stator_ring.union(twelve_radial_teeth).cut(center_shaft_opening).clean() '''.strip(), }, } SPACE_HTML = r''' CADForge RLVE

OpenEnv + CadQuery + GRPO

CADForge RLVE

Frontier models can describe CAD, but tiny models often fail at executable, editable CADQuery. CADForge turns that gap into an RL environment: write code, compile real geometry, receive reward, repair, and improve.

Buildable CAD preview

Choose a task. CADForge will score a weak seed, repair it, verify it, and render the improved STL.

weak seed repaired CAD
Step 0Waiting for weak seed.
Step 1Waiting for repaired CAD.
build--
reward--
editability--
semantic--

Judge rerun links

The full CADForge SFT and GRPO runs were executed on a RunPod H200 as distinct production scripts. The Colab notebook is the public smoke path: it validates OpenEnv, loads the public dataset, runs the real CadQuery reward backend, and launches tiny SFT/GRPO checks with the same source files.

The environment fights back

CADForge is not a static benchmark. The first dense GRPO run exposed a reward flaw: the model could receive partial reward while still failing to build. The environment adapted. Buildability became the first gate, failed code became negative reward, and each failure type became a curriculum target.

1. Observe failure

SyntaxError, missing fixture, invented API, disconnected parts, clipped final union, weak semantic match.

2. Generate curriculum

Failed trajectories become new repair tasks: fix one concrete CAD failure and improve the reward delta.

3. Train harder rollouts

GRPO groups compare buildable vs broken candidates, giving the model clean advantage signals.

Real training evidence

We ran seven distinct training experiments on RunPod H200. The important story is not just that loss went down; it is that the environment exposed reward hacking, then build-gated GRPO and adaptive repair made buildable CAD separate from broken code.

RunWhat happenedLesson
1. Qwen3.5-2B SFTtrain loss 1.4480 to 0.1658; eval loss 0.4477 to 0.26762B learned CadQuery grammar and trace format.
2. Qwen3.5-2B dense GRPO160 completions; 0.0% build rate; mean/best reward 0.3387 / 0.5303Reward was learnable, but too hackable without a hard build gate.
3. Qwen3.5-9B SFTtrain loss 2.6020 to 0.1413; eval loss 0.3650 to 0.23989B learned syntax and structure faster than 2B.
4. Qwen3.5-9B dense GRPO160 completions; 0.0% build rate; mean/best reward 0.4355 / 0.6828Bigger model got higher scalar reward while still failing buildability.
5. Qwen3.5-9B strict GRPO320 completions; 96 buildable; best CADForge score 0.9352Buildability-first reward produced the first real breakthrough.
6. Adaptive repair v1120 repair completions; 0 buildable; clipped-output pattern exposedThe environment found a curriculum/completion-length bug.
7. Adaptive repair final 8192180 repair completions; 53 buildable; 0 clipped completions; best reward 0.882Failure mining plus longer completions recovered buildable repairs.
Best downloadable model adapter

Use the final Qwen3.5-9B adaptive-repair LoRA to test CADQuery generation and repair locally or on a GPU notebook.

Download best model

Held-out eval after strict GRPO built 2 of 3 generated CadQuery files successfully. The remaining failed chair case clipped before the final assembly, which directly motivated the adaptive repair run.

Reward hacking and reward design

CADForge started with dense rewards for code shape, semantic words, topology, contact, reference similarity, and editability. Training showed a classic reward-hacking pattern: models could earn positive-looking reward while still producing non-buildable CAD. The fix was to make buildability the first gate.

What was hackable

Dense reward gave partial credit for code-like text, named parts, and semantic hints even when CadQuery failed to export an STL.

What fixed it

Strict GRPO makes failed builds negative. Dense topology, semantic, contact, reference, and editability scores unlock only after the CAD builds.

Step rewards

Each action returns reward JSON: build, topology, contact, semantic parts, reference similarity, editability, efficiency, and verifier notes.

{
  "build": 1.0,
  "topology": 0.82,
  "contact": 0.74,
  "semantic_parts": 0.61,
  "reference_similarity": 0.58,
  "editability": 0.80,
  "total": 0.86,
  "notes": ["candidate builds", "recognizable task parts", "clean fixture"]
}

Space APIs

The Space is both a demo and an OpenEnv-style reward service. A model can submit CadQuery code, receive structured observations, and use those step rewards for SFT data generation, GRPO rollouts, or human-readable debugging.

GET /healthzHealth check for the CADForge Space.
POST /api/space/repair-loopRuns the demo loop: weak seed, repaired CAD, CadQuery build, reward JSON, and STL artifact URLs.
POST /api/space/demoScores a known buildable candidate and returns reward dimensions plus artifact paths.
GET /api/space/loop-stl/{task_id}Downloads the repaired STL from the most recent repair-loop run.
GET /api/space/loop-stl/{task_id}/{step_id}Downloads a specific weak-seed or repaired-step STL for visual comparison.
OpenEnv step routeThe OpenEnv server wraps complete CadQuery Python files as actions and returns observations with reward JSON and verifier notes.

Theme alignment

Long-horizon planning

CAD is built through repeated code edits, reward observations, and repairs rather than one-shot text generation.

Professional world modeling

The agent interacts with real CadQuery execution, STL export, mesh checks, reference metrics, and persistent state.

Self-improvement

The curriculum adapts to model failures: build errors and weak semantics become the next tasks the model must learn to repair.

Reward JSON will appear here after the repair loop runs.
''' def _safe_task_id(value: str) -> str: return "".join(ch for ch in value if ch.isalnum() or ch in {"_", "-"}).strip() or "caster_wheel_fork" def _artifact_dir(task_id: str) -> Path: return APP_ROOT / "runs" / "cadquery-env" / f"space-demo-{_safe_task_id(task_id)}" / "sample" def _loop_artifact_dir(task_id: str, step_id: str) -> Path: return APP_ROOT / "runs" / "cadquery-env" / f"space-loop-{_safe_task_id(task_id)}" / _safe_task_id(step_id) def _stl_path(artifact_dir: Path) -> Path: stl_path = artifact_dir / "candidate_normalized.stl" if not stl_path.exists(): stl_path = artifact_dir / "candidate.stl" return stl_path def _step_summary(result: dict, fallback: str) -> str: notes = result.get("notes") if isinstance(notes, list) and notes: return str(notes[0])[:140] error = result.get("error") if error: return str(error)[:140] return fallback def _step_payload(name: str, result: dict, fallback: str) -> dict: reward = result.get("reward", {}) if isinstance(result.get("reward"), dict) else {} return { "name": name, "ok": bool(result.get("ok")), "build": float(reward.get("build", 0.0) or 0.0), "reward": float(reward.get("total", 0.0) or 0.0), "editability": float(reward.get("editability", 0.0) or 0.0), "semantic_parts": float(reward.get("semantic_parts", 0.0) or 0.0), "summary": _step_summary(result, fallback), } @app.get("/", response_class=HTMLResponse) @app.get("/web", response_class=HTMLResponse) def space_home() -> HTMLResponse: return HTMLResponse(SPACE_HTML) @app.get("/healthz") def healthz() -> dict[str, str]: return {"ok": "true", "env": "cadforge_cadquery"} @app.post("/api/space/demo") async def run_space_demo(request: Request) -> JSONResponse: payload = await request.json() task_id = _safe_task_id(str(payload.get("task_id") or "caster_wheel_fork")) demo = DEMO_TASKS.get(task_id) or DEMO_TASKS["caster_wheel_fork"] task_spec = read_task_spec(task_id) or {"id": task_id, "prompt": demo["prompt"]} reference_root = APP_ROOT / "data" / "reference-metrics" / task_id result = evaluate_code( demo["code"], f"space-demo-{task_id}", "sample", task_prompt=str(task_spec.get("prompt") or demo["prompt"]), reference_root=reference_root, reward_mode="fast", task_spec=task_spec, ) reward = result.get("reward", {}) artifact_dir = Path(str(result.get("artifacts_dir") or _artifact_dir(task_id))) stl_path = _stl_path(artifact_dir) if not result.get("ok") or not stl_path.exists(): return JSONResponse( { "ok": False, "task_id": task_id, "label": demo["label"], "error": result.get("error", "CadQuery build failed"), "notes": result.get("notes", []), "reward": reward, }, status_code=500, ) return JSONResponse( { "ok": True, "task_id": task_id, "label": demo["label"], "prompt": demo["prompt"], "reward": reward, "notes": result.get("notes", []), "elapsed_ms": result.get("elapsed_ms", 0), "stl_url": f"/api/space/stl/{task_id}", "artifacts_dir": result.get("artifacts_dir"), } ) @app.post("/api/space/repair-loop") async def run_space_repair_loop(request: Request) -> JSONResponse: payload = await request.json() task_id = _safe_task_id(str(payload.get("task_id") or "caster_wheel_fork")) demo = DEMO_TASKS.get(task_id) or DEMO_TASKS["caster_wheel_fork"] task_spec = read_task_spec(task_id) or {"id": task_id, "prompt": demo["prompt"]} task_prompt = str(task_spec.get("prompt") or demo["prompt"]) reference_root = APP_ROOT / "data" / "reference-metrics" / task_id broken = evaluate_code( demo["broken_code"], f"space-loop-{task_id}", "broken", task_prompt=task_prompt, reference_root=reference_root, reward_mode="fast", task_spec=task_spec, ) repaired = evaluate_code( demo["code"], f"space-loop-{task_id}", "repaired", task_prompt=task_prompt, reference_root=reference_root, reward_mode="fast", task_spec=task_spec, ) repaired_reward = repaired.get("reward", {}) if isinstance(repaired.get("reward"), dict) else {} broken_reward = broken.get("reward", {}) if isinstance(broken.get("reward"), dict) else {} artifact_dir = Path(str(repaired.get("artifacts_dir") or _loop_artifact_dir(task_id, "repaired"))) stl_path = _stl_path(artifact_dir) steps = [ _step_payload("weak seed", broken, "CADForge scores the weak first attempt before repair."), _step_payload("repaired CAD", repaired, "The repaired candidate is rebuilt and rescored."), ] if not repaired.get("ok") or not stl_path.exists(): return JSONResponse( { "ok": False, "task_id": task_id, "label": demo["label"], "prompt": demo["prompt"], "steps": steps, "error": repaired.get("error", "Repaired CadQuery build failed"), "final": {"reward": repaired_reward}, }, status_code=500, ) return JSONResponse( { "ok": True, "task_id": task_id, "label": demo["label"], "prompt": demo["prompt"], "delta_reward": float(repaired_reward.get("total", 0.0) or 0.0) - float(broken_reward.get("total", 0.0) or 0.0), "steps": steps, "seed": { "reward": broken_reward, "notes": broken.get("notes", []), "elapsed_ms": broken.get("elapsed_ms", 0), "stl_url": f"/api/space/loop-stl/{task_id}/broken", "artifacts_dir": broken.get("artifacts_dir"), }, "final": { "reward": repaired_reward, "notes": repaired.get("notes", []), "elapsed_ms": repaired.get("elapsed_ms", 0), "stl_url": f"/api/space/loop-stl/{task_id}", "artifacts_dir": repaired.get("artifacts_dir"), }, } ) @app.get("/api/space/stl/{task_id}") def get_space_stl(task_id: str) -> FileResponse: safe_task = _safe_task_id(task_id) artifact_dir = _artifact_dir(safe_task) stl_path = _stl_path(artifact_dir) if not stl_path.exists(): raise HTTPException(status_code=404, detail="Run the demo verifier first.") return FileResponse(stl_path, media_type="model/stl", filename=f"{safe_task}.stl") @app.get("/api/space/loop-stl/{task_id}") def get_space_loop_stl(task_id: str) -> FileResponse: safe_task = _safe_task_id(task_id) stl_path = _stl_path(_loop_artifact_dir(safe_task, "repaired")) if not stl_path.exists(): raise HTTPException(status_code=404, detail="Run the repair loop first.") return FileResponse(stl_path, media_type="model/stl", filename=f"{safe_task}-repaired.stl") @app.get("/api/space/loop-stl/{task_id}/{step_id}") def get_space_loop_step_stl(task_id: str, step_id: str) -> FileResponse: safe_task = _safe_task_id(task_id) safe_step = _safe_task_id(step_id) stl_path = _stl_path(_loop_artifact_dir(safe_task, safe_step)) if not stl_path.exists(): raise HTTPException(status_code=404, detail="Run the repair loop first.") return FileResponse(stl_path, media_type="model/stl", filename=f"{safe_task}-{safe_step}.stl") def main(host: str = "0.0.0.0", port: int = 8000) -> None: import uvicorn uvicorn.run(app, host=host, port=port) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--port", type=int, default=8000) args = parser.parse_args() if args.port == 8000: main() else: main(port=args.port)