cicd-diagnosis-env / inference.py
Jayant-Kernel
fix: eliminate all 0.0 scores - clamp defaults and error fallbacks to 0.01
657da66
# inference.py — run this from the project root
import json
import os
import sys
from openai import OpenAI
from cicd_diagnosis_env.client import CICDEnv
from cicd_diagnosis_env.models import DiagnoseAction
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
HF_TOKEN = os.getenv("HF_TOKEN")
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
NUM_EPISODES = int(os.getenv("NUM_EPISODES", "10"))
MAX_STEPS = 3
_api_key = HF_TOKEN or os.getenv("OPENAI_API_KEY", "no-key")
llm = OpenAI(api_key=_api_key, base_url=API_BASE_URL)
_SYSTEM = """You are an expert CI/CD engineer diagnosing pipeline failures.
Respond ONLY with valid JSON (no markdown):
{
"failure_category": "<dependency|config|flaky|code_bug|infra>",
"root_cause": "<concise 1-2 sentence explanation>",
"suggested_fix": "<concrete action>",
"confidence": <float 0.0-1.0>
}"""
def log_start(task, env, model):
print(f"[START] task={task} env={env} model={model}", flush=True)
def log_step(step, action, reward, done, error=None):
err = error if error else "null"
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={err}", flush=True)
def log_end(success, steps, score, rewards):
rstr = ",".join(f"{r:.2f}" for r in rewards)
print(f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rstr}", flush=True)
def call_llm(log, summary):
msg = f"Error summary: {summary}\n\nFull log:\n{log}"
resp = llm.chat.completions.create(
model=MODEL_NAME,
messages=[{"role": "system", "content": _SYSTEM}, {"role": "user", "content": msg}],
temperature=0.2,
max_tokens=300,
)
raw = resp.choices[0].message.content.strip()
if raw.startswith("```"):
lines = raw.splitlines()
raw = "\n".join(lines[1:-1]).strip()
parsed = json.loads(raw)
return DiagnoseAction(
failure_category=parsed["failure_category"],
root_cause=parsed["root_cause"],
suggested_fix=parsed["suggested_fix"],
confidence=float(parsed.get("confidence", 0.8)),
)
def run_episode(env, ep_num):
obs = env.reset()
task_name = f"task{obs.task_id}"
log_start(task=task_name, env="cicd_diagnosis_env", model=MODEL_NAME)
rewards = []
steps = 0
success = False
score = 0.01
for step in range(1, MAX_STEPS + 1):
error = None
try:
action = call_llm(obs.pipeline_log, obs.error_summary)
action_str = f"diagnose(category={action.failure_category})"
except Exception as e:
error = str(e)
action_str = "diagnose(error)"
log_step(step, action_str, 0.01, True, error)
rewards.append(0.01)
steps = step
break
obs = env.step(action)
reward = obs.reward if obs.reward is not None else 0.01
rewards.append(reward)
steps = step
score = obs.score if obs.score > 0 else 0.01
log_step(step, action_str, reward, obs.done, error)
if obs.done:
success = score >= 0.5
break
log_end(success=success, steps=steps, score=score, rewards=rewards)
return score
def main():
env = CICDEnv(base_url=ENV_URL)
for ep in range(1, NUM_EPISODES + 1):
try:
run_episode(env, ep)
except Exception as e:
print(f"[DEBUG] episode {ep} error: {e}", file=sys.stderr, flush=True)
log_end(success=False, steps=0, score=0.01, rewards=[0.01])
if __name__ == '__main__':
main()