kernel_v1
Browse files- README.md +21 -13
- app.py +61 -0
- env_server.py +202 -0
- inference.py +73 -0
- models.py +49 -0
- openenv.yaml +55 -0
- openenv_train.py +70 -0
- pyproject.toml +24 -0
- requirements.txt +11 -0
- runtime.txt +1 -0
- server/__init__.py +1 -0
- server/__pycache__/__init__.cpython-312.pyc +0 -0
- server/__pycache__/app.cpython-312.pyc +0 -0
- server/app.py +10 -0
README.md
CHANGED
|
@@ -1,13 +1,21 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Kernel Writer
|
| 2 |
+
|
| 3 |
+
CUDA kernel optimization
|
| 4 |
+
|
| 5 |
+
## Run locally
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
pip install -r requirements.txt
|
| 9 |
+
python app.py
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
## Hugging Face Space setup
|
| 13 |
+
|
| 14 |
+
Set the OpenAI key in Space **Settings → Variables and secrets** as:
|
| 15 |
+
|
| 16 |
+
- `OPENAI_API_KEY`
|
| 17 |
+
|
| 18 |
+
Optional:
|
| 19 |
+
|
| 20 |
+
- `MODEL_NAME` (default: `gpt-4`)
|
| 21 |
+
- `API_BASE_URL` (default: `https://api.openai.com/v1`)
|
app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from typing import Iterator, Tuple
|
| 4 |
+
from env_server import KernelOptimization_env, TASKS
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
from models import Action
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import traceback
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
def ui(task_id: str, max_steps: int, openai_api_key: str) -> Iterator[Tuple[str, str]]:
    """Gradio handler: run up to *max_steps* LLM optimization rounds, streaming (log, best_code).

    Uses the key typed into the textbox, falling back to the OPENAI_API_KEY env var.
    Yields after every step so the UI updates incrementally; on error, yields the
    traceback alongside the best code found so far and stops.
    """
    log = []
    env = KernelOptimization_env()
    api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        yield "ERROR: Missing OPENAI_API_KEY", ""
        return

    model = os.getenv("MODEL_NAME", "gpt-4")
    client = OpenAI(api_key=api_key, base_url=os.getenv("API_BASE_URL", "https://api.openai.com/v1"))
    obs = env.reset(task_id=task_id)["observation"]
    best_code = obs["current_best_code"]
    log.append(f"Task: {obs['task_name']}")

    # BUG FIX: gr.Slider may deliver a float; range() requires an int.
    for _ in range(int(max_steps)):
        try:
            prompt = f"Optimize CUDA code:\n{obs['current_best_code']}\nPending checks: {obs['pending_checks']}\nReturn code only."
            res = client.chat.completions.create(
                model=model,
                temperature=0.0,
                messages=[
                    {"role": "system", "content": "Return only optimized CUDA code."},
                    {"role": "user", "content": prompt},
                ],
            )
            # Fall back to the current best code when the model returns nothing.
            code = (res.choices[0].message.content or "").strip() or obs["current_best_code"]
            step = env.step(Action(optimized_code=code, strategy="ui_proposed"))
            obs = step.observation.model_dump()
            best_code = obs["current_best_code"]
            log.append(f"step={obs['step_count']} reward={step.reward.value:.3f} speedup={obs['current_best_speedup']:.3f}x")
            yield "\n".join(log), best_code
            if step.done:
                break
        except Exception as e:
            yield f"{chr(10).join(log)}\nERROR: {e}\n{traceback.format_exc()}", best_code
            return
|
| 48 |
+
|
| 49 |
+
# Gradio UI wiring. `ui` is a generator, so Gradio streams each yielded
# (logs, best_code) pair to the output components as the episode runs.
with gr.Blocks(title="CUDA Kernel Optimizer") as demo:
    gr.Markdown("CUDA Kernel Optimizer - OpenEnv-aligned workflow")
    # Inputs: task selector, optimization round budget, and user-supplied API key.
    task = gr.Dropdown(choices=list(TASKS.keys()), value="vector_add_easy", label="Task")
    steps = gr.Slider(minimum=1, maximum=12, value=6, step=1, label="Max Steps")
    key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")
    run = gr.Button("Run Optimization", variant="primary")
    # Outputs: streaming run log and the best kernel found so far.
    logs = gr.Textbox(label="Logs", lines=14)
    code = gr.Code(label="Best Code", language="cpp", lines=16)
    run.click(ui, inputs=[task, steps, key], outputs=[logs, code])


if __name__ == "__main__":
    # 0.0.0.0:7860 is the conventional binding for Hugging Face Spaces.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
|
env_server.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional, Dict, Any
|
| 2 |
+
from models import Action, StepResult, ResetRequest, StepRequest, EnvState, Observation, Reward
|
| 3 |
+
from fastapi import FastAPI, HTTPException
|
| 4 |
+
import random
|
| 5 |
+
|
| 6 |
+
# Task catalogue keyed by task_id. Each entry carries display metadata, the
# unoptimized baseline kernel (runtime data — do not edit), and the named
# review checks the agent must satisfy (graded heuristically by check_passed).
TASKS: Dict[str, Dict[str, Any]] = {
    "vector_add_easy": {
        "name": "Vector Addition Kernel Optimization",
        "difficulty": "easy",
        "max_steps": 5,
        # Speedup at which the quality component of the reward saturates.
        "target_speedup": 1.8,
        "baseline_code": """extern "C" __global__ void vector_add(const float* a, const float* b, float* c, int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) c[idx] = a[idx] + b[idx];
}""",
        "checks": {
            "coalesced_memory": "Use memory-coalesced indexing",
            "vectorized_loads": "Use vectorized loads/stores (float2/float4)",
            "bounds_safe": "Keep safe boundary checks",
        },

    },
    "matmul_medium": {
        "name": "Matrix Multiplication Kernel Optimization",
        "difficulty": "medium",
        "max_steps": 6,
        "target_speedup": 3.0,
        "baseline_code": """extern "C" __global__ void matmul(const float* A, const float* B, float* C, int N)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < N && col < N) {
float sum = 0.0f;
for (int k = 0; k < N; k++) sum += A[row * N + k] * B[k * N + col];
C[row * N + col] = sum;
}
}""",
        "checks": {
            "shared_tiling": "Use shared-memory tiling",
            "synchronization": "Synchronize tiles with __syncthreads",
            "register_accumulation": "Accumulate partial sums in registers",
        },
    },
    "reduction_hard": {
        "name": "Reduction Kernel Optimization",
        "difficulty": "hard",
        "max_steps": 7,
        "target_speedup": 3.5,
        "baseline_code": """extern "C" __global__ void reduce_sum(const float* input, float* output, int n)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : 0.0f;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) output[blockIdx.x] = sdata[0];
}""",
        "checks": {
            "warp_primitive": "Use warp-level primitive (e.g., __shfl_down_sync)",
            "bank_conflict_reduction": "Reduce shared-memory bank conflicts",
            "unrolled_reduction": "Use partial unrolling for final reduction",
        },
    }
}
|
| 70 |
+
|
| 71 |
+
def check_passed(check_id: str, code_lower: str) -> bool:
    """Return True when *code_lower* (lower-cased kernel source) satisfies the named review check.

    Purely lexical heuristics — substring matching, no compilation or parsing.
    Unknown check ids fail.
    """
    predicates = {
        "coalesced_memory": lambda s: "idx" in s and ("blockidx.x" in s or "threadidx.x" in s),
        "vectorized_loads": lambda s: "float4" in s or "float2" in s,
        "bounds_safe": lambda s: "if" in s and "< n" in s,
        "shared_tiling": lambda s: "__shared__" in s,
        "synchronization": lambda s: "__syncthreads" in s,
        "register_accumulation": lambda s: "sum" in s or "acc" in s,
        "warp_primitive": lambda s: "__shfl_down_sync" in s or "__shfl_sync" in s,
        "bank_conflict_reduction": lambda s: "pad" in s or "bank" in s or "+ 1" in s,
        "unrolled_reduction": lambda s: "#pragma unroll" in s or "unroll" in s,
    }
    predicate = predicates.get(check_id)
    if predicate is None:
        return False
    return predicate(code_lower)
|
| 91 |
+
|
| 92 |
+
def to_observation(task_id: str, state: EnvState) -> Observation:
    """Build the client-facing Observation for *task_id* from the current EnvState.

    `done` is True once every check is satisfied or the step budget is exhausted.
    """
    task = TASKS[task_id]
    satisfied = set(state.completed_checks)
    pending_descriptions = [desc for cid, desc in task["checks"].items() if cid not in satisfied]
    completed_descriptions = [task["checks"][cid] for cid in state.completed_checks if cid in task["checks"]]
    episode_over = (not pending_descriptions) or state.step_count >= state.max_steps
    return Observation(
        task_id=task_id,
        task_name=task["name"],
        difficulty=task["difficulty"],
        baseline_code=task["baseline_code"],
        current_best_code=state.best_code or task["baseline_code"],
        current_best_speedup=state.best_speedup,
        step_count=state.step_count,
        max_steps=state.max_steps,
        pending_checks=pending_descriptions,
        completed_checks=completed_descriptions,
        done=episode_over,
    )
|
| 96 |
+
|
| 97 |
+
def grade_episode(task_id: str, completed_checks: List[str], best_speedup: float, step_count: int, max_steps: int) -> float:
    """Deterministic episode score in [0, 1].

    Weighted blend: 50% check completion, 35% speedup relative to the task
    target (capped at 1), 15% step efficiency (fewer steps score higher).
    """
    task = TASKS[task_id]
    total_checks = max(len(task["checks"]), 1)
    completion = len(completed_checks) / total_checks
    speedup_score = min(best_speedup / task["target_speedup"], 1.0)
    # First step is "free": efficiency only decays from the second step on.
    steps_fraction = (step_count - 1) / max(max_steps, 1)
    efficiency = max(0.0, 1.0 - steps_fraction)
    blended = 0.5 * completion + 0.35 * speedup_score + 0.15 * efficiency
    return round(min(1.0, max(0.0, blended)), 4)
|
| 103 |
+
|
| 104 |
+
class KernelOptimization_env:
    """In-process OpenEnv-style environment for iterative CUDA kernel optimization.

    Lifecycle: reset() selects a task and seeds state from its baseline kernel;
    step() grades a proposed rewrite with deterministic lexical heuristics and
    shapes a reward; state_dict() exposes internals for the /state endpoint.
    """

    def __init__(self):
        self.state = EnvState(initialized=False)
        self.current_task_id: Optional[str] = None

    def _estimate_speedup(self, completed_checks: set, compile_ok: bool) -> float:
        """Deterministic speedup proxy in [1.0, target_speedup].

        Scales linearly with the fraction of review checks completed; code that
        fails the compile heuristic earns no credit. (Replaces the original
        line, which assigned a 3-tuple to est_speedup instead of calling an
        estimator — the subsequent float comparison raised TypeError.)
        """
        task = TASKS[self.current_task_id]
        total = len(task["checks"])
        if not compile_ok or total == 0:
            return 1.0
        return 1.0 + (task["target_speedup"] - 1.0) * (len(completed_checks) / total)

    def reset(self, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Start a new episode. Unknown task_id -> HTTP 400; None -> random task."""
        if task_id and task_id not in TASKS:
            raise HTTPException(status_code=400, detail=f"unknown task_id: {task_id}")
        self.current_task_id = task_id or random.choice(list(TASKS.keys()))
        task = TASKS[self.current_task_id]
        self.state = EnvState(
            initialized=True,
            task_id=self.current_task_id,
            step_count=0,
            max_steps=task["max_steps"],
            total_reward=0.0,
            best_code=task["baseline_code"],
            best_speedup=1.0,
            completed_checks=[],
            action_history=[],
        )
        return {"observation": to_observation(self.current_task_id, self.state).model_dump()}

    def step(self, action: Action) -> StepResult:
        """Grade one proposed kernel rewrite; returns observation/reward/done/info.

        Raises HTTP 400 when called before reset().
        """
        if not self.state.initialized or not self.current_task_id:
            raise HTTPException(status_code=400, detail="Environment not initialized. Call /reset first.")

        self.state.step_count += 1
        code = action.optimized_code or ""
        code_lower = code.lower()
        # Cheap structural sanity check standing in for a real nvcc compile.
        compile_ok = "__global__" in code_lower and "{" in code and "}" in code

        completed = set(self.state.completed_checks)
        newly_completed = {
            cid
            for cid in TASKS[self.current_task_id]["checks"]
            if cid not in completed and check_passed(cid, code_lower)
        }
        completed.update(newly_completed)
        self.state.completed_checks = sorted(completed)

        # BUG FIX: was `est_speedup = self.current_task_id, completed, compile_ok`
        # (a tuple), which made the comparison below raise TypeError.
        est_speedup = self._estimate_speedup(completed, compile_ok)
        if est_speedup > self.state.best_speedup:
            self.state.best_speedup = est_speedup
            self.state.best_code = code

        # Reward shaping: progress on new checks + speedup quality - penalties.
        progress = 0.22 * len(newly_completed)
        quality = 0.18 * min(self.state.best_speedup / TASKS[self.current_task_id]["target_speedup"], 1.0)
        penalty = 0.0
        if not compile_ok:
            penalty -= 0.25
        if not newly_completed:
            penalty -= 0.08
        reward_value = max(0.0, min(1.0, progress + quality + penalty))
        self.state.total_reward += reward_value

        self.state.action_history.append(
            {
                "step": self.state.step_count,
                "newly_completed": sorted(newly_completed),
                "compile_ok": compile_ok,
                "estimated_speedup": est_speedup,
                "reward": reward_value,
            }
        )

        obs = to_observation(self.current_task_id, self.state)
        info: Dict[str, Any] = {"compile_ok": compile_ok, "estimated_speedup": est_speedup}
        if obs.done:
            info["final_score"] = grade_episode(
                self.current_task_id, self.state.completed_checks, self.state.best_speedup, self.state.step_count, self.state.max_steps
            )

        return StepResult(
            observation=obs,
            reward=Reward(
                value=round(reward_value, 4),
                components={"progress": round(progress, 4), "quality": round(quality, 4), "penalty": round(penalty, 4)},
            ),
            done=obs.done,
            info=info,
        )

    def state_dict(self) -> Dict[str, Any]:
        """Serializable snapshot of internal state plus grading metadata when a task is active."""
        data = self.state.model_dump()
        if self.current_task_id:
            data["task_name"] = TASKS[self.current_task_id]["name"]
            data["difficulty"] = TASKS[self.current_task_id]["difficulty"]
            data["grader_score"] = grade_episode(
                self.current_task_id, self.state.completed_checks, self.state.best_speedup, self.state.step_count, self.state.max_steps
            )
        return data
|
| 182 |
+
|
| 183 |
+
# Module-level singleton environment backing the HTTP endpoints below.
env = KernelOptimization_env()
app = FastAPI(title="Kernel Optimization", version="1.0.0")


@app.get("/")
def health_check():
    """Liveness probe."""
    return {"status": "healthy", "service": "kernel-optimization-openenv"}


@app.post("/reset")
def reset(request: ResetRequest = ResetRequest()):
    """Start a new episode; optional task_id pins the task (random otherwise)."""
    return env.reset(task_id=request.task_id)


@app.post("/step")
def step(request: StepRequest):
    """Apply one optimization action; returns observation/reward/done/info."""
    return env.step(request.action).model_dump()


@app.get("/state")
def state():
    """Expose full internal state plus grading metadata (debug/inspection)."""
    return env.state_dict()
|
inference.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from openai import OpenAI, AuthenticationError
|
| 3 |
+
from typing import Dict
|
| 4 |
+
from env_server import TASKS, KernelOptimization_env, grade_episode
|
| 5 |
+
from models import Action
|
| 6 |
+
import json
|
| 7 |
+
import sys
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
def extract_code(text: str) -> str:
    """Extract the body of the outermost ``` fenced block from *text*.

    Strips a leading language tag (cuda/cpp/c++/c) when present. Returns *text*
    unchanged when there is no well-formed fence pair (BUG FIX: the original
    returned "" for a lone fence, and raised IndexError when the chunk was a
    bare language tag with no newline).
    """
    if "```" not in text:
        return text
    start = text.find("```")
    end = text.rfind("```")
    if end <= start:
        # Only one fence — nothing well-formed to extract.
        return text
    chunk = text[start + 3 : end]
    first_line, sep, rest = chunk.partition("\n")
    if sep and first_line.strip().lower() in {"cuda", "cpp", "c++", "c"}:
        return rest
    return chunk
|
| 20 |
+
|
| 21 |
+
def choose_action(client: OpenAI, model: str, observation: Dict) -> Action:
    """Ask the chat model for an improved kernel and wrap the reply in an Action.

    Falls back to the observation's current best code when the model returns
    nothing usable after fence extraction.
    """
    prompt = f"""Optimize this CUDA kernel.
Task: {observation['task_name']}
Pending checks: {observation['pending_checks']}
Baseline:
{observation['baseline_code']}
Current best speedup: {observation['current_best_speedup']}x
Return only optimized CUDA code.
"""
    messages = [
        {"role": "system", "content": "You are a CUDA optimization expert. Return code only."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model=model, temperature=0.0, messages=messages)
    raw_reply = (completion.choices[0].message.content or "").strip()
    candidate = extract_code(raw_reply).strip()
    if not candidate:
        candidate = observation["current_best_code"]
    return Action(optimized_code=candidate, strategy="llm_proposed")
|
| 41 |
+
|
| 42 |
+
def run_task(client: OpenAI, model: str, task_id: str) -> float:
    """Roll out one full episode on *task_id* with the LLM policy; return its graded score."""
    environment = KernelOptimization_env()
    observation = environment.reset(task_id=task_id)["observation"]
    while True:
        proposal = choose_action(client, model, observation)
        outcome = environment.step(proposal)
        observation = outcome.observation.model_dump()
        if outcome.done:
            break
    final = environment.state
    return grade_episode(task_id, final.completed_checks, final.best_speedup, final.step_count, final.max_steps)
|
| 52 |
+
def main() -> int:
    """Run the rule-graded baseline over every task; return a process exit code.

    Reads OPENAI_API_KEY (required), MODEL_NAME and API_BASE_URL (optional)
    from the environment. NOTE(review): the defaults here (gemma-3-4b,
    api.oxlo.ai) differ from the README's documented defaults — confirm which
    is intended.
    """
    if not os.getenv("OPENAI_API_KEY"):
        print("openai key not set")
        # BUG FIX: previously fell through without returning and crashed when
        # constructing the client below with a missing key.
        return 1

    model = os.getenv("MODEL_NAME", "gemma-3-4b")
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("API_BASE_URL", "https://api.oxlo.ai/v1"))

    scores: Dict[str, float] = {}
    try:
        for task_id in TASKS:
            scores[task_id] = run_task(client, model, task_id)
            print(f"[TASK] {task_id} score={scores[task_id]:.4f}")
    except AuthenticationError:
        print("ERROR: OpenAI authentication failed. Check OPENAI_API_KEY.", file=sys.stderr)
        return 1

    avg = sum(scores.values()) / len(scores)
    print(f"[BASELINE] model={model} average_score={avg:.4f}")
    print(json.dumps({"scores": scores, "average": round(avg, 4)}))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
models.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import Optional, Dict, Literal, List, Any
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Action(BaseModel):
    """An agent-proposed kernel rewrite submitted to the environment."""
    optimized_code: str
    strategy: Optional[str] = None  # free-text label for the approach taken
    expected_speedup: Optional[float] = None  # agent's own estimate; not used by the grader


class Reward(BaseModel):
    """Shaped step reward, clamped to [0, 1], with its additive components."""
    value: float = Field(ge=0.0, le=1.0)
    components: Dict[str, float]  # e.g. progress / quality / penalty breakdown


class Observation(BaseModel):
    """Client-facing snapshot returned after reset/step."""
    task_id: str
    task_name: str
    difficulty: Literal["easy", "medium", "hard"]
    baseline_code: str  # unoptimized reference kernel
    current_best_code: str  # best-scoring submission so far (baseline if none)
    current_best_speedup: float
    step_count: int
    max_steps: int
    pending_checks: List[str]  # human-readable descriptions of unmet checks
    completed_checks: List[str]  # descriptions of checks already satisfied
    done: bool


class EnvState(BaseModel):
    """Full internal environment state (superset of Observation)."""
    initialized: bool
    task_id: Optional[str] = None
    step_count: int = 0
    max_steps: int = 0
    total_reward: float = 0.0
    best_code: str = ""
    best_speedup: float = 1.0  # baseline speedup is 1.0x by definition
    completed_checks: List[str] = Field(default_factory=list)  # check ids, not descriptions
    action_history: List[Dict[str, Any]] = Field(default_factory=list)  # per-step audit trail


class ResetRequest(BaseModel):
    """POST /reset payload; task_id=None selects a random task."""
    task_id: Optional[str] = None


class StepRequest(BaseModel):
    """POST /step payload."""
    action: Action


class StepResult(BaseModel):
    """Return value of a single environment step."""
    observation: Observation
    reward: Reward
    done: bool
    info: Dict[str, Any]  # extras: compile_ok, estimated_speedup, final_score on done
|
| 49 |
+
|
openenv.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: kernel_writer
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: |
|
| 4 |
+
Real world CUDA kernel engineering environment for iterative optimization, code review checks and performance driven reward shaping.
|
| 5 |
+
|
| 6 |
+
environment:
|
| 7 |
+
type: code_optimization
|
| 8 |
+
runtime: python3.12.3
|
| 9 |
+
containerized: true
|
| 10 |
+
|
| 11 |
+
metadata:
|
| 12 |
+
tags:
|
| 13 |
+
- openenv
|
| 14 |
+
- CUDA
|
| 15 |
+
- kernel_optimization
|
| 16 |
+
- reinforcement_learning
|
| 17 |
+
author: aaloksan
|
| 18 |
+
|
| 19 |
+
tasks:
|
| 20 |
+
- id: vector_add_easy
|
| 21 |
+
name: "Vector Addition Kernel Optimization"
|
| 22 |
+
difficulty: easy
|
| 23 |
+
objective: "Improve memory throughput while preserving correctness."
|
| 24 |
+
grader: deterministic_rule_based
|
| 25 |
+
|
| 26 |
+
- id: matmul_medium
|
| 27 |
+
name: "Matrix Multiplication Kernel Optimization"
|
| 28 |
+
difficulty: medium
|
| 29 |
+
objective: "Apply shared-memory tiling and synchronization safely."
|
| 30 |
+
grader: deterministic_rule_based
|
| 31 |
+
|
| 32 |
+
- id: reduction_hard
|
| 33 |
+
name: "Reduction Kernel Optimization"
|
| 34 |
+
difficulty: hard
|
| 35 |
+
objective: "Use warp-level optimization and reduce memory conflicts."
|
| 36 |
+
grader: deterministic_rule_based
|
| 37 |
+
|
| 38 |
+
interfaces:
|
| 39 |
+
reset:
|
| 40 |
+
method: POST
|
| 41 |
+
path: /reset
|
| 42 |
+
returns: initial observation and info
|
| 43 |
+
step:
|
| 44 |
+
method: POST
|
| 45 |
+
path: /step
|
| 46 |
+
returns: observation, reward, done, info
|
| 47 |
+
state:
|
| 48 |
+
method: GET
|
| 49 |
+
path: /state
|
| 50 |
+
returns: current environment state
|
| 51 |
+
|
| 52 |
+
baseline:
|
| 53 |
+
script: inference.py
|
| 54 |
+
model_env_var: MODEL_NAME
|
| 55 |
+
api_key_env_var: OPENAI_API_KEY
|
openenv_train.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env_server import KernelOptimization_env, TASKS
|
| 2 |
+
from trl import GRPOConfig, GRPOTrainer
|
| 3 |
+
from models import Action
|
| 4 |
+
from typing import List
|
| 5 |
+
from datasets import Dataset
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
class KernelOptTool:
    """Per-rollout tool wrapper around KernelOptimization_env for agentic training.

    Exposes `reset` and `submit_optimization` as tool calls and records the most
    recent step reward so `reward_func` can harvest it after the rollout.
    """

    def __init__(self):
        self.env = KernelOptimization_env()
        self.reward = 0.0  # reward of the most recent step
        self.done = False  # True once the episode has terminated

    def reset(self, **kwargs) -> str | None:
        """Start a new episode (optionally pinned via kwargs['task_id']); return the task briefing."""
        task_id = kwargs.get("task_id")
        result = self.env.reset(task_id=task_id)
        obs = result["observation"]
        self.reward = 0.0
        self.done = False
        return (
            f"Task: {obs['task_name']}\n"
            f"Baseline CUDA kernel:\n{obs['baseline_code']}\n"
            f"Pending checks: {obs['pending_checks']}\n"
            "Use tools to submit improved code."
        )

    def submit_optimization(self, optimized_code: str, strategy: str = "") -> str:
        """Submit a candidate kernel; returns a one-line progress summary.

        Raises ValueError when called after the episode has finished.
        """
        if self.done:
            raise ValueError("Episode is already done.")
        result = self.env.step(Action(optimized_code=optimized_code, strategy=strategy))
        self.reward = result.reward.value
        self.done = result.done
        obs = result.observation
        return (
            f"reward={result.reward.value:.4f}, "
            f"best_speedup={obs.current_best_speedup:.3f}x, "
            f"pending_checks={obs.pending_checks}, done={result.done}"
        )

    # Backward-compatible alias: the original method name carried a typo
    # ("submit_optiization") and existing tool schemas may still reference it.
    submit_optiization = submit_optimization
|
| 39 |
+
|
| 40 |
+
def reward_func(environments, **kwargs) -> List[float]:
    """Harvest the latest per-environment step reward for the trainer.

    Fixes the original misspelled parameter name (`environmnets`), which would
    break any framework caller passing the environments by keyword; positional
    callers are unaffected.
    """
    return [environment.reward for environment in environments]
|
| 42 |
+
|
| 43 |
+
def build_dataset(repeats_per_task: int = 32) -> Dataset:
    """Build the GRPO prompt dataset: one chat-style row per (task, repeat).

    The extra 'task_id' column lets the environment reset to the matching task.
    """
    rows = []
    for task_id, task in TASKS.items():
        message = [{"role": "user", "content": f"Optimize CUDA kernel task: {task['name']}"}]
        rows.extend((message, task_id) for _ in range(repeats_per_task))
    prompts = [prompt for prompt, _ in rows]
    task_ids = [tid for _, tid in rows]
    return Dataset.from_dict({"prompt": prompts, "task_id": task_ids})
|
| 50 |
+
|
| 51 |
+
def main():
    """Launch GRPO training of TRAIN_MODEL (default Qwen/Qwen3-0.6B) against the kernel env."""
    model_name = os.getenv("TRAIN_MODEL", "Qwen/Qwen3-0.6B")
    dataset = build_dataset()
    trainer = GRPOTrainer(
        model=model_name,
        train_dataset=dataset,
        # Rewards are read back from each rollout's KernelOptTool instance.
        reward_funcs=reward_func,
        # NOTE(review): `environment_factory` and `chat_template_kwargs` assume a
        # trl release with agentic-environment support — confirm against the
        # installed trl version before running.
        environment_factory=KernelOptTool,
        args=GRPOConfig(
            chat_template_kwargs={"enable_thinking": False},
            max_completion_length=2048,
            num_generations=4,
            log_completions=True,
        ),
    )
    trainer.train()


if __name__ == "__main__":
    main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "kernel_writer"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "OpenEnv-compatible CUDA kernel optimization environment."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"fastapi>=0.110.0",
|
| 13 |
+
"uvicorn>=0.30.0",
|
| 14 |
+
"pydantic>=2.7.0",
|
| 15 |
+
"openai>=1.0.0",
|
| 16 |
+
"openenv-core>=0.2.0",
|
| 17 |
+
"python-dotenv>=1.0.0",
|
| 18 |
+
"gradio>=4.44.0",
|
| 19 |
+
"datasets>=2.20.0",
|
| 20 |
+
"trl>=0.12.0"
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.scripts]
|
| 24 |
+
server = "server.app:main"
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110.0
|
| 2 |
+
uvicorn>=0.30.0
|
| 3 |
+
pydantic>=2.7.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
openenv-core>=0.2.0
|
| 6 |
+
gradio>=4.44.0
|
| 7 |
+
datasets>=2.20.0
|
| 8 |
+
trl>=0.12.0
|
| 9 |
+
pytest>=7.4.0
|
| 10 |
+
pyyaml>=6.0.0
|
| 11 |
+
python-dotenv
|
runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.10
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .app import app
|
server/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
server/__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (595 Bytes). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import uvicorn

# Re-exported so the "server.app:app" import string below resolves to the
# FastAPI instance defined in env_server.
from env_server import app


def main():
    """Entry point: serve the environment API on 0.0.0.0 ($PORT, default 7860)."""
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run("server.app:app", host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()
|