Spaces:
Sleeping
Sleeping
Update inference.py
Browse files- inference.py +63 -55
inference.py
CHANGED
|
@@ -8,15 +8,65 @@ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-7B-Instruct")
|
|
| 8 |
BENCHMARK = os.getenv("BENCHMARK", "code-fix-env")
|
| 9 |
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
|
| 10 |
|
| 11 |
-
# 🔥 FIX 3: Early crash protection — no silent 0.1 failures
|
| 12 |
if not API_KEY:
|
| 13 |
raise ValueError("Missing API_KEY or HF_TOKEN environment variable")
|
| 14 |
|
| 15 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 16 |
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 21 |
|
| 22 |
try:
|
|
@@ -28,32 +78,30 @@ def solve(task_name: str, task_input: str) -> str:
|
|
| 28 |
"role": "system",
|
| 29 |
"content": (
|
| 30 |
"You are an expert developer.\n"
|
| 31 |
-
"Return ONLY valid JSON.\n"
|
| 32 |
"Explanation must be MAX 2 lines.\n"
|
| 33 |
"Fixed code must be SHORT and COMPLETE.\n"
|
| 34 |
"Preserve all newlines and indentation in fixed_code.\n"
|
| 35 |
-
"Do NOT cut output.\n"
|
| 36 |
"Format strictly:\n"
|
| 37 |
"{\"explanation\":\"...\",\"fixed_code\":\"...\",\"language\":\"...\"}"
|
| 38 |
),
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"role": "user",
|
| 42 |
-
"content":
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
],
|
| 45 |
)
|
| 46 |
|
| 47 |
output = response.choices[0].message.content
|
| 48 |
-
score =
|
| 49 |
|
| 50 |
-
# 🔥 FIX 4: Dynamic reward from actual score, not static 0.5
|
| 51 |
-
# 🔥 FIX 1 & 2: Correct STEP format with action + done + error
|
| 52 |
print(f"[STEP] step=1 action=solve reward={round(score, 2)} done=false error=null", flush=True)
|
| 53 |
-
print(f"[STEP] step=2 action=grade reward={round(score, 2)} done=true
|
| 54 |
-
|
| 55 |
-
# 🔥 FIX 1 & 2: Correct END format with success + rewards list
|
| 56 |
-
print(f"[END] success=true steps=2 score={score} rewards=0.30,{score}", flush=True)
|
| 57 |
|
| 58 |
return output
|
| 59 |
|
|
@@ -64,46 +112,6 @@ def solve(task_name: str, task_input: str) -> str:
|
|
| 64 |
return f"Error: {err}"
|
| 65 |
|
| 66 |
|
| 67 |
-
|
| 68 |
-
if not output or len(output.strip()) < 5:
|
| 69 |
-
return 0.1
|
| 70 |
-
|
| 71 |
-
# 🔥 FIX 5: Start at 0.3 — more realistic baseline, avoids fake inflation
|
| 72 |
-
score = 0.3
|
| 73 |
-
try:
|
| 74 |
-
raw = output.strip().replace("```json", "").replace("```", "").strip()
|
| 75 |
-
parsed = json.loads(raw)
|
| 76 |
-
if parsed.get("explanation"):
|
| 77 |
-
score += 0.15
|
| 78 |
-
if parsed.get("fixed_code"):
|
| 79 |
-
score += 0.15
|
| 80 |
-
if parsed.get("language"):
|
| 81 |
-
score += 0.1
|
| 82 |
-
except Exception:
|
| 83 |
-
score = 0.2
|
| 84 |
-
|
| 85 |
-
return round(min(max(score, 0.1), 0.9), 2)
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
# ── Tasks ─────────────────────────────────────────────────────────────────────
|
| 89 |
-
TASKS = [
|
| 90 |
-
{
|
| 91 |
-
"id": "task_1",
|
| 92 |
-
"description": "Fix syntax error in Python",
|
| 93 |
-
"input": "def hello(\n print('hello world')",
|
| 94 |
-
},
|
| 95 |
-
{
|
| 96 |
-
"id": "task_2",
|
| 97 |
-
"description": "Fix logic bug in JavaScript",
|
| 98 |
-
"input": "function add(a, b) { return a - b; }",
|
| 99 |
-
},
|
| 100 |
-
{
|
| 101 |
-
"id": "task_3",
|
| 102 |
-
"description": "Fix type error and missing await in async function",
|
| 103 |
-
"input": "async function fetchData() { let data = fetchFromAPI(); return data.json; }",
|
| 104 |
-
},
|
| 105 |
-
]
|
| 106 |
-
|
| 107 |
-
# 🔥 FIX 6: Single task only — let framework handle the loop
|
| 108 |
if __name__ == "__main__":
|
| 109 |
-
solve(TASKS[0]
|
|
|
|
| 8 |
BENCHMARK = os.getenv("BENCHMARK", "code-fix-env")
|
| 9 |
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
|
| 10 |
|
|
|
|
| 11 |
if not API_KEY:
|
| 12 |
raise ValueError("Missing API_KEY or HF_TOKEN environment variable")
|
| 13 |
|
| 14 |
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 15 |
|
| 16 |
|
| 17 |
+
# ── Shared grader ─────────────────────────────────────────────────────────────
|
| 18 |
+
|
| 19 |
+
def grader(task_id, output):
|
| 20 |
+
if not output:
|
| 21 |
+
return 0.0
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
data = json.loads(output)
|
| 25 |
+
score = 0.5
|
| 26 |
+
|
| 27 |
+
if data.get("fixed_code"):
|
| 28 |
+
score += 0.2
|
| 29 |
+
if data.get("explanation"):
|
| 30 |
+
score += 0.2
|
| 31 |
+
if data.get("language"):
|
| 32 |
+
score += 0.1
|
| 33 |
+
|
| 34 |
+
return round(min(score, 1.0), 2)
|
| 35 |
+
|
| 36 |
+
except Exception:
|
| 37 |
+
return 0.2
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ── Tasks ─────────────────────────────────────────────────────────────────────
|
| 41 |
+
|
| 42 |
+
TASKS = [
|
| 43 |
+
{
|
| 44 |
+
"id": "task_1",
|
| 45 |
+
"input": "def add(a,b): return a-b",
|
| 46 |
+
"expected": "fix subtraction bug β should return a+b",
|
| 47 |
+
"grader": grader,
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"id": "task_2",
|
| 51 |
+
"input": "function x() { return 1+ }",
|
| 52 |
+
"expected": "fix syntax error β incomplete expression",
|
| 53 |
+
"grader": grader,
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"id": "task_3",
|
| 57 |
+
"input": "async function f(){ fetchData() }",
|
| 58 |
+
"expected": "fix missing await before fetchData()",
|
| 59 |
+
"grader": grader,
|
| 60 |
+
},
|
| 61 |
+
]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ── Solver ────────────────────────────────────────────────────────────────────
|
| 65 |
+
|
| 66 |
+
def solve(task: dict) -> str:
|
| 67 |
+
task_name = task["id"]
|
| 68 |
+
task_input = task["input"]
|
| 69 |
+
|
| 70 |
print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
|
| 71 |
|
| 72 |
try:
|
|
|
|
| 78 |
"role": "system",
|
| 79 |
"content": (
|
| 80 |
"You are an expert developer.\n"
|
| 81 |
+
"Return ONLY valid JSON β no markdown, no preamble.\n"
|
| 82 |
"Explanation must be MAX 2 lines.\n"
|
| 83 |
"Fixed code must be SHORT and COMPLETE.\n"
|
| 84 |
"Preserve all newlines and indentation in fixed_code.\n"
|
|
|
|
| 85 |
"Format strictly:\n"
|
| 86 |
"{\"explanation\":\"...\",\"fixed_code\":\"...\",\"language\":\"...\"}"
|
| 87 |
),
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"role": "user",
|
| 91 |
+
"content": (
|
| 92 |
+
f"Expected fix: {task['expected']}\n"
|
| 93 |
+
f"Fix this code:\n{task_input}"
|
| 94 |
+
),
|
| 95 |
},
|
| 96 |
],
|
| 97 |
)
|
| 98 |
|
| 99 |
output = response.choices[0].message.content
|
| 100 |
+
score = task["grader"](task_name, output)
|
| 101 |
|
|
|
|
|
|
|
| 102 |
print(f"[STEP] step=1 action=solve reward={round(score, 2)} done=false error=null", flush=True)
|
| 103 |
+
print(f"[STEP] step=2 action=grade reward={round(score, 2)} done=true error=null", flush=True)
|
| 104 |
+
print(f"[END] success=true steps=2 score={score} rewards={score},{score}", flush=True)
|
|
|
|
|
|
|
| 105 |
|
| 106 |
return output
|
| 107 |
|
|
|
|
| 112 |
return f"Error: {err}"
|
| 113 |
|
| 114 |
|
| 115 |
+
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
if __name__ == "__main__":
|
| 117 |
+
solve(TASKS[0])
|