from __future__ import annotations import json from pathlib import Path from fastapi import HTTPException, Request from fastapi.responses import FileResponse, HTMLResponse, JSONResponse from openenv.core.env_server.http_server import create_app from cadquery_env import APP_ROOT, evaluate_code, read_task_spec from .cadforge_environment import CadForgeCadQueryEnvironment from .openenv_models import CadForgeAction, CadForgeObservation app = create_app( CadForgeCadQueryEnvironment, CadForgeAction, CadForgeObservation, env_name="cadforge_cadquery", max_concurrent_envs=4, ) DEMO_TASKS = { "caster_wheel_fork": { "label": "Caster wheel fork", "prompt": "Design a small caster wheel assembly as editable code-CAD. Include a wheel, axle, U-shaped fork, swivel stem, and top mounting plate with four holes.", "broken_code": r''' import cadquery as cq # Weak seed: buildable blank plate with almost no requested assembly detail. plate = cq.Workplane("XY").box(44, 44, 4).translate((0, 0, 2)) fixture = plate.clean() '''.strip(), "code": r''' import cadquery as cq plate_w = 44.0 plate_d = 44.0 plate_t = 4.0 stem_d = 10.0 stem_h = 16.0 fork_inside_w = 22.0 fork_leg_t = 4.0 fork_leg_depth = 18.0 fork_leg_h = 24.0 wheel_d = 22.0 wheel_w = 10.0 axle_d = 4.2 def make_top_plate(): plate = cq.Workplane("XY").box(plate_w, plate_d, plate_t).translate((0, 0, plate_t / 2)) holes = ( cq.Workplane("XY") .pushPoints([(-14, -14), (-14, 14), (14, -14), (14, 14)]) .circle(2.1) .extrude(plate_t + 2) .translate((0, 0, -1)) ) return plate.cut(holes) def make_fork(): left_x = -(fork_inside_w / 2 + fork_leg_t / 2) right_x = fork_inside_w / 2 + fork_leg_t / 2 left_leg = cq.Workplane("XY").box(fork_leg_t, fork_leg_depth, fork_leg_h).translate((left_x, 0, plate_t + fork_leg_h / 2 - 1)) right_leg = cq.Workplane("XY").box(fork_leg_t, fork_leg_depth, fork_leg_h).translate((right_x, 0, plate_t + fork_leg_h / 2 - 1)) bridge = cq.Workplane("XY").box(fork_inside_w + 2 * fork_leg_t, fork_leg_depth, 4).translate((0, 0, 
plate_t + 1.5)) return left_leg.union(right_leg).union(bridge) plate = make_top_plate() stem = cq.Workplane("XY").cylinder(stem_h, stem_d / 2).translate((0, 0, plate_t + stem_h / 2 - 0.5)) fork = make_fork() wheel = cq.Workplane("XY").cylinder(wheel_w, wheel_d / 2).cut(cq.Workplane("XY").cylinder(wheel_w + 2, 2.5)).translate((0, 0, 18)) axle = cq.Workplane("XY").cylinder(fork_inside_w + 3, axle_d / 2).translate((0, 0, 18)) fixture = plate.union(stem).union(fork).union(wheel).union(axle).clean() '''.strip(), }, "axial_motor_stator_12_slot": { "label": "12-slot motor stator", "prompt": "Design a simple 12-slot axial motor stator concept with a circular ring, radial teeth, and center shaft opening.", "broken_code": r''' import cadquery as cq # Weak seed: buildable disk with a center bore, but no teeth or slot structure. outer_radius = 60.0 shaft_radius = 12.0 thickness = 8.0 disk = cq.Workplane("XY").circle(outer_radius).extrude(thickness) bore = cq.Workplane("XY").circle(shaft_radius).extrude(thickness + 2).translate((0, 0, -1)) fixture = disk.cut(bore).clean() '''.strip(), "code": r''' import cadquery as cq # Editable axial motor stator concept with twelve radial teeth and center bore. 
stator_slot_count = 12 stator_outer_radius = 60.0 stator_inner_radius = 24.0 stator_shaft_radius = 11.0 stator_thickness = 8.0 radial_tooth_length = 28.0 radial_tooth_width = 10.0 back_iron_width = stator_outer_radius - stator_inner_radius def make_stator_ring(): outer = cq.Workplane("XY").circle(stator_outer_radius).extrude(stator_thickness) inner_cut = cq.Workplane("XY").circle(stator_inner_radius).extrude(stator_thickness + 2).translate((0, 0, -1)) return outer.cut(inner_cut) def make_radial_tooth(index): angle = 360.0 * index / stator_slot_count tooth_center = stator_inner_radius + radial_tooth_length / 2.0 tooth = ( cq.Workplane("XY") .center(tooth_center, 0) .rect(radial_tooth_length, radial_tooth_width) .extrude(stator_thickness + 1.0) ) root_pad = ( cq.Workplane("XY") .center(stator_inner_radius + 2.0, 0) .rect(8.0, radial_tooth_width + 4.0) .extrude(stator_thickness + 1.0) ) return tooth.union(root_pad).rotate((0, 0, 0), (0, 0, 1), angle) def make_twelve_slot_tooth_set(): teeth = None for tooth_index in range(stator_slot_count): radial_tooth = make_radial_tooth(tooth_index) teeth = radial_tooth if teeth is None else teeth.union(radial_tooth) return teeth stator_ring = make_stator_ring() twelve_radial_teeth = make_twelve_slot_tooth_set() center_shaft_opening = cq.Workplane("XY").circle(stator_shaft_radius).extrude(stator_thickness + 4).translate((0, 0, -2)) fixture = stator_ring.union(twelve_radial_teeth).cut(center_shaft_opening).clean() '''.strip(), }, } SPACE_HTML = r'''
OpenEnv + CadQuery + GRPO
Frontier models can describe CAD, but tiny models often fail at executable, editable CadQuery. CADForge turns that gap into an RL environment: write code, compile real geometry, receive reward, repair, and improve.
Choose a task. CADForge will score a weak seed, repair it, verify it, and render the improved STL.
CADForge is not a static benchmark. The first dense GRPO run exposed a reward flaw: the model could receive partial reward while still failing to build. The environment adapted. Buildability became the first gate, failed code became negative reward, and each failure type became a curriculum target.
SyntaxError, missing fixture, invented API, disconnected parts, clipped final union, weak semantic match.
Failed trajectories become new repair tasks: fix one concrete CAD failure and improve the reward delta.
GRPO groups compare buildable vs broken candidates, giving the model clean advantage signals.
We ran seven distinct training experiments on RunPod H200. The important story is not just that loss went down; it is that the environment exposed reward hacking, then build-gated GRPO and adaptive repair made buildable CAD separate from broken code.
| Run | What happened | Lesson |
|---|---|---|
| 1. Qwen3.5-2B SFT | train loss 1.4480 to 0.1658; eval loss 0.4477 to 0.2676 | 2B learned CadQuery grammar and trace format. |
| 2. Qwen3.5-2B dense GRPO | 160 completions; 0.0% build rate; mean/best reward 0.3387 / 0.5303 | Reward was learnable, but too hackable without a hard build gate. |
| 3. Qwen3.5-9B SFT | train loss 2.6020 to 0.1413; eval loss 0.3650 to 0.2398 | 9B learned syntax and structure faster than 2B. |
| 4. Qwen3.5-9B dense GRPO | 160 completions; 0.0% build rate; mean/best reward 0.4355 / 0.6828 | Bigger model got higher scalar reward while still failing buildability. |
| 5. Qwen3.5-9B strict GRPO | 320 completions; 96 buildable; best CADForge score 0.9352 | Buildability-first reward produced the first real breakthrough. |
| 6. Adaptive repair v1 | 120 repair completions; 0 buildable; clipped-output pattern exposed | The environment found a curriculum/completion-length bug. |
| 7. Adaptive repair final 8192 | 180 repair completions; 53 buildable; 0 clipped completions; best reward 0.882 | Failure mining plus longer completions recovered buildable repairs. |
Use the final Qwen3.5-9B adaptive-repair LoRA to test CadQuery generation and repair locally or on a GPU notebook.
Held-out eval after strict GRPO built 2 of 3 generated CadQuery files successfully. The remaining failed chair case clipped before the final assembly, which directly motivated the adaptive repair run.
CADForge started with dense rewards for code shape, semantic words, topology, contact, reference similarity, and editability. Training showed a classic reward-hacking pattern: models could earn positive-looking reward while still producing non-buildable CAD. The fix was to make buildability the first gate.
Dense reward gave partial credit for code-like text, named parts, and semantic hints even when CadQuery failed to export an STL.
Strict GRPO makes failed builds negative. Dense topology, semantic, contact, reference, and editability scores unlock only after the CAD builds.
Each action returns reward JSON: build, topology, contact, semantic parts, reference similarity, editability, efficiency, and verifier notes.
{
"build": 1.0,
"topology": 0.82,
"contact": 0.74,
"semantic_parts": 0.61,
"reference_similarity": 0.58,
"editability": 0.80,
"total": 0.86,
"notes": ["candidate builds", "recognizable task parts", "clean fixture"]
}
The Space is both a demo and an OpenEnv-style reward service. A model can submit CadQuery code, receive structured observations, and use those step rewards for SFT data generation, GRPO rollouts, or human-readable debugging.
GET /healthz — Health check for the CADForge Space.
POST /api/space/repair-loop — Runs the demo loop: weak seed, repaired CAD, CadQuery build, reward JSON, and STL artifact URLs.
POST /api/space/demo — Scores a known buildable candidate and returns reward dimensions plus artifact paths.
GET /api/space/loop-stl/{task_id} — Downloads the repaired STL from the most recent repair-loop run.
GET /api/space/loop-stl/{task_id}/{step_id} — Downloads a specific weak-seed or repaired-step STL for visual comparison.
OpenEnv step route — The OpenEnv server wraps complete CadQuery Python files as actions and returns observations with reward JSON and verifier notes.
CAD is built through repeated code edits, reward observations, and repairs rather than one-shot text generation.
The agent interacts with real CadQuery execution, STL export, mesh checks, reference metrics, and persistent state.
The curriculum adapts to model failures: build errors and weak semantics become the next tasks the model must learn to repair.
Reward JSON will appear here after the repair loop runs.