# src/submission/submit.py
import asyncio
import json
import re
import threading

import httpx
import requests
from openai import OpenAI

from src.envs import (
    USE_LM_STUDIO,
    EVAL_MODEL,
    XAI_API_KEY,
    QUESTIONS_PATH,
    GOLD_PATH,
    load_jsonl,
)

# Alternative: the xai_sdk Client (xai_sdk.Client / xai_sdk.chat) could be used
# instead of the OpenAI-compatible endpoint below.
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
    timeout=httpx.Timeout(3600.0),  # override default timeout; reasoning models can be slow
) if not USE_LM_STUDIO else None

# Local LM Studio server used when USE_LM_STUDIO is set.
LM_STUDIO_URL = "http://192.168.68.106:1234/v1/chat/completions"

SYSTEM_PROMPT = """You are a strict grader for a RAG QA competition.
Return JSON: {"score": 0|1|2}.
Rules:
- 2: semantically equivalent to gold
- 1: partially correct
- 0: wrong/empty/irrelevant
"""

if USE_LM_STUDIO:
    # Local models are less reliable at JSON mode, so spell out the schema explicitly.
    SYSTEM_PROMPT = """You are a strict grader.
Return ONLY a JSON object with key "score" and optional "justification".
Example: {"score": 2, "justification": "..."}
Scores:
2 = fully correct
1 = partially correct
0 = wrong/empty/irrelevant
"""

USER_PROMPT_TEMPLATE = """Question: {question}
Gold answer: {gold}
Participant answer: {pred}
"""


def parse_score(text: str) -> int:
    """Extract the first JSON object from the text and return its "score" (0, 1, or 2)."""
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return 0
    try:
        obj = json.loads(m.group(0))
        s = int(obj.get("score", 0))
        return s if s in (0, 1, 2) else 0
    except (ValueError, TypeError):  # bad JSON or a non-integer score
        return 0


async def eval_one(question, gold, pred):
    """Grade a single participant answer against the gold answer; returns 0, 1, or 2."""
    pred = (pred or "").strip()
    if not pred:
        return 0

    prompt = USER_PROMPT_TEMPLATE.format(question=question, gold=gold, pred=pred)
    payload = {
        "model": EVAL_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0,
    }
    if not USE_LM_STUDIO:
        payload["response_format"] = {"type": "json_object"}

    # --- LM Studio mode ---
    if USE_LM_STUDIO:
        try:
            r = requests.post(LM_STUDIO_URL, json=payload, timeout=60)
            data = r.json()
            msg = data["choices"][0]["message"]["content"]
            return parse_score(msg)
        except Exception as e:
            print(f"LM Studio grading request failed: {e}")
            return 0

    # --- xAI (OpenAI-compatible) mode ---
    try:
        # The OpenAI client is synchronous, so run it in a worker thread
        # to keep the event loop free for concurrent grading calls.
        resp = await asyncio.to_thread(
            lambda: client.chat.completions.create(**payload)
        )
        msg = resp.choices[0].message.content
        return parse_score(msg)
    except Exception:
        return 0


async def _evaluate_all(tasks):
    return await asyncio.gather(*tasks)


def _run_async(coro):
    """
    Run async code reliably:
    - if no event loop is running, use plain asyncio.run
    - if called from inside an already-running loop (Gradio/AnyIO/Jupyter),
      run the coroutine in a fresh loop on a separate thread
    """
    try:
        # Normal case: no active loop in this thread.
        return asyncio.run(coro)
    except RuntimeError:
        # Inside an active event loop: run in a separate thread with its own loop.
        result_container = {}

        def runner():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result_container["res"] = loop.run_until_complete(coro)
            finally:
                loop.close()

        t = threading.Thread(target=runner)
        t.start()
        t.join()
        return result_container["res"]


def evaluate_submission(submit_path: str):
    # Submission JSONL: {"id": ..., "answer": ..., "doc_ids": [...]} per line.
    sub_rows = load_jsonl(submit_path)
    pred_map = {str(x["id"]): str(x.get("answer", "")) for x in sub_rows}

    questions = load_jsonl(QUESTIONS_PATH)
    gold_rows = load_jsonl(GOLD_PATH)
    gold_map = {str(x["id"]): str(x.get("gold_answer", "")) for x in gold_rows}

    tasks = []
    for q in questions:
        qid = str(q["id"])
        question = q["question"]
        gold = gold_map.get(qid, "")
        pred = pred_map.get(qid, "")
        tasks.append(eval_one(question, gold, pred))

    scores = _run_async(_evaluate_all(tasks))
    return {
        "zeros": scores.count(0),
        "ones": scores.count(1),
        "twos": scores.count(2),
        "n": len(scores),
    }
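

# --- Usage sketch ---
# A minimal, hypothetical CLI entry point for running the grader directly.
# It assumes a submission JSONL at the given path following the
# {"id": ..., "answer": ..., "doc_ids": [...]} schema noted above; the
# default filename "submission.jsonl" is an assumption, adjust to your layout.
if __name__ == "__main__":
    import sys

    submit_path = sys.argv[1] if len(sys.argv) > 1 else "submission.jsonl"
    stats = evaluate_submission(submit_path)
    print(json.dumps(stats, indent=2))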