# src/submission/submit.py
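# LLM-as-judge grader for RAG submissions: each participant answer is compared
# against the gold answer and scored 0/1/2, using either the xAI API
# (OpenAI-compatible) or a local LM Studio server.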
import json
import re
import asyncio

import requests
import httpx
from openai import OpenAI

from src.envs import (
    USE_LM_STUDIO, EVAL_MODEL, XAI_API_KEY,
    QUESTIONS_PATH, GOLD_PATH, load_jsonl
)
# from xai_sdk import Client
# from xai_sdk.chat import user, system
#
# client = Client(
#     api_key=XAI_API_KEY,
#     timeout=3600,  # override the default timeout with a longer one for reasoning models
# ) if not USE_LM_STUDIO else None
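# The xAI API is OpenAI-compatible, so the stock OpenAI client works with
# base_url pointed at https://api.x.ai/v1.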
client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
    timeout=httpx.Timeout(3600.0),  # override the default timeout for reasoning models
) if not USE_LM_STUDIO else None
SYSTEM_PROMPT = """You are a strict grader for a RAG QA competition.
Return JSON: {"score": 0|1|2}.
Rules:
- 2: semantically equivalent to gold
- 1: partially correct
- 0: wrong/empty/irrelevant
"""
if USE_LM_STUDIO:
    SYSTEM_PROMPT = """You are a strict grader.
Return ONLY a JSON object with key "score" and optional "justification".
Example: {"score": 2, "justification": "..."}
Scores:
2 = fully correct
1 = partially correct
0 = wrong/empty/irrelevant
"""
USER_PROMPT_TEMPLATE = """Question:
{question}
Gold answer:
{gold}
Participant answer:
{pred}
"""
# client = OpenAI(api_key=OPENAI_API_KEY) if not USE_LM_STUDIO else None
async def eval_one(question, gold, pred):
    pred = (pred or "").strip()
    if not pred:
        return 0

    prompt = USER_PROMPT_TEMPLATE.format(question=question, gold=gold, pred=pred)
    payload = {
        "model": EVAL_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0,
    }
    def parse_score(text: str) -> int:
        # Pull the first JSON object out of the model's reply
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if not m:
            return 0
        try:
            obj = json.loads(m.group(0))
            s = int(obj.get("score", 0))
            return s if s in (0, 1, 2) else 0
        except (ValueError, TypeError):
            return 0
    if not USE_LM_STUDIO:
        payload["response_format"] = {"type": "json_object"}
    # --- LM Studio mode ---
    if USE_LM_STUDIO:
        try:
            # requests is blocking, so run it off the event loop
            r = await asyncio.to_thread(
                requests.post,
                "http://192.168.68.106:1234/v1/chat/completions",
                json=payload,
                timeout=60,
            )
            data = r.json()
            msg = data["choices"][0]["message"]["content"]
            return parse_score(msg)
        except Exception as e:
            print("LM Studio request failed:", e)
            return 0
    # --- OpenAI mode ---
    try:
        resp = await asyncio.to_thread(
            lambda: client.chat.completions.create(**payload)
        )
        msg = resp.choices[0].message.content
        score = int(json.loads(msg).get("score", 0))
        return score if score in (0, 1, 2) else 0
    except Exception:
        return 0
async def _evaluate_all(tasks):
    return await asyncio.gather(*tasks)
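# Note: gather launches every grading call at once; for large submissions an
# asyncio.Semaphore around eval_one would bound concurrency (not done here).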
def _run_async(coro):
    """
    Run async code reliably:
    - if no event loop is running, use plain asyncio.run
    - if we are inside an already-running loop (Gradio/AnyIO/Jupyter),
      run the coroutine in a fresh thread with its own loop
    """
    import threading
    try:
        # Normal case: no active loop in this thread
        return asyncio.run(coro)
    except RuntimeError:
        # Inside an active event loop: run in a separate thread
        result_container = {}

        def runner():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result_container["res"] = loop.run_until_complete(coro)
            finally:
                loop.close()

        t = threading.Thread(target=runner)
        t.start()
        t.join()
        return result_container["res"]
def evaluate_submission(submit_path: str):
    # submission jsonl: {"id": ..., "answer": ..., "doc_ids": [...]} per line
    sub_rows = load_jsonl(submit_path)
    pred_map = {str(x["id"]): str(x.get("answer", "")) for x in sub_rows}

    questions = load_jsonl(QUESTIONS_PATH)
    gold_rows = load_jsonl(GOLD_PATH)
    gold_map = {str(x["id"]): str(x.get("gold_answer", "")) for x in gold_rows}

    tasks = []
    for q in questions:
        qid = str(q["id"])
        question = q["question"]
        gold = gold_map.get(qid, "")
        pred = pred_map.get(qid, "")
        # print(question, gold, pred)
        tasks.append(eval_one(question, gold, pred))

    scores = _run_async(_evaluate_all(tasks))
    zeros = scores.count(0)
    ones = scores.count(1)
    twos = scores.count(2)
    return {
        "zeros": zeros,
        "ones": ones,
        "twos": twos,
        "n": len(scores),
    }
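# Minimal smoke test; the submission path below is a placeholder, not part of
# the original module.
if __name__ == "__main__":
    stats = evaluate_submission("data/sample_submission.jsonl")
    print(stats)  # e.g. {"zeros": 0, "ones": 1, "twos": 9, "n": 10}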