# src/submission/submit.py
import json
import re
import asyncio
import requests
from openai import OpenAI
import httpx

from src.envs import (
    USE_LM_STUDIO, EVAL_MODEL, XAI_API_KEY,
    QUESTIONS_PATH, GOLD_PATH, load_jsonl
)
# from xai_sdk import Client
# from xai_sdk.chat import user, system
#
# client = Client(
#     api_key=XAI_API_KEY,
#     timeout=3600,  # Override default timeout with longer timeout for reasoning models
# ) if not USE_LM_STUDIO else None

client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
    timeout=httpx.Timeout(3600.0),  # Override default timeout with longer timeout for reasoning models
) if not USE_LM_STUDIO else None

# chat = client.chat.create(model="grok-4")
# chat.append(system("You are a PhD-level mathematician."))
# chat.append(user("What is 2 + 2?"))
#
# response = chat.sample()
# print(response.content)
SYSTEM_PROMPT = """You are a strict grader for a RAG QA competition.
Return JSON: {"score": 0|1|2}.
Rules:
- 2: semantically equivalent to gold
- 1: partially correct
- 0: wrong/empty/irrelevant
"""

if USE_LM_STUDIO:
    SYSTEM_PROMPT = """You are a strict grader.
Return ONLY a JSON object with key "score" and optional "justification".
Example: {"score": 2, "justification": "..."}
Scores:
2 = fully correct
1 = partially correct
0 = wrong/empty/irrelevant
"""
USER_PROMPT_TEMPLATE = """Question:
{question}
Gold answer:
{gold}
Participant answer:
{pred}
"""

# client = OpenAI(api_key=OPENAI_API_KEY) if not USE_LM_STUDIO else None

async def eval_one(question, gold, pred):
    pred = (pred or "").strip()
    if not pred:
        return 0

    prompt = USER_PROMPT_TEMPLATE.format(question=question, gold=gold, pred=pred)
    payload = {
        "model": EVAL_MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0,
    }
    def parse_score(text: str) -> int:
        # Extract the first JSON object embedded in the model's reply.
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if not m:
            return 0
        try:
            obj = json.loads(m.group(0))
            s = int(obj.get("score", 0))
            return s if s in (0, 1, 2) else 0
        except (ValueError, TypeError):
            return 0
    if not USE_LM_STUDIO:
        payload["response_format"] = {"type": "json_object"}

    # --- LM Studio mode ---
    if USE_LM_STUDIO:
        try:
            # requests is blocking, so offload it to a worker thread to keep the
            # event loop free while asyncio.gather fans out the grading calls
            r = await asyncio.to_thread(
                requests.post,
                "http://192.168.68.106:1234/v1/chat/completions",
                json=payload,
                timeout=60,
            )
            data = r.json()
            print(data)
            msg = data["choices"][0]["message"]["content"]
            score = parse_score(msg)
            return score
        except Exception as e:
            print("LM Studio grading failed:", e)
            return 0
    # --- OpenAI mode ---
    try:
        resp = await asyncio.to_thread(
            lambda: client.chat.completions.create(**payload)
        )
        msg = resp.choices[0].message.content
        score = int(json.loads(msg).get("score", 0))
        return score if score in (0, 1, 2) else 0
    except Exception:
        return 0
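
# Example of grading a single item outside the batch flow (illustrative only;
# the question/gold/answer strings below are made up, not project data):
#
#     score = asyncio.run(eval_one(
#         "What is the capital of France?",
#         "Paris",
#         "The capital is Paris.",
#     ))
#     print(score)  # 2 expected if the grader judges the answers equivalent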

async def _evaluate_all(tasks):
    return await asyncio.gather(*tasks)


def _run_async(coro):
    """
    Run a coroutine reliably:
    - if no event loop is running in this thread -> plain asyncio.run
    - if called inside an already running loop (Gradio/AnyIO/Jupyter) -> run it in a new thread with its own loop
    """
    import threading
    try:
        # usual case: no loop is running in this thread
        return asyncio.run(coro)
    except RuntimeError:
        # called from inside a running event loop -> run in a separate thread
        result_container = {}

        def runner():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result_container["res"] = loop.run_until_complete(coro)
            finally:
                loop.close()

        t = threading.Thread(target=runner)
        t.start()
        t.join()
        return result_container["res"]
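
# _run_async therefore works both from a plain script (no loop running) and
# from inside Gradio/Jupyter callbacks (loop already running). A trivial
# check, for illustration only:
#
#     assert _run_async(asyncio.sleep(0, result="ok")) == "ok"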

def evaluate_submission(submit_path: str):
    # submission jsonl: {"id": ..., "answer": ..., "doc_ids": [...]} per line
    sub_rows = load_jsonl(submit_path)
    pred_map = {str(x["id"]): str(x.get("answer", "")) for x in sub_rows}

    questions = load_jsonl(QUESTIONS_PATH)
    gold_rows = load_jsonl(GOLD_PATH)
    gold_map = {str(x["id"]): str(x.get("gold_answer", "")) for x in gold_rows}

    tasks = []
    for q in questions:
        qid = str(q["id"])
        question = q["question"]
        gold = gold_map.get(qid, "")
        pred = pred_map.get(qid, "")
        # print(question, gold, pred)
        tasks.append(eval_one(question, gold, pred))

    scores = _run_async(_evaluate_all(tasks))

    zeros = scores.count(0)
    ones = scores.count(1)
    twos = scores.count(2)
    return {
        "zeros": zeros,
        "ones": ones,
        "twos": twos,
        "n": len(scores),
    }
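
# Minimal usage sketch ("submission.jsonl" is a placeholder path, not one
# defined by this project):
#
# if __name__ == "__main__":
#     stats = evaluate_submission("submission.jsonl")
#     print(stats)  # {"zeros": ..., "ones": ..., "twos": ..., "n": ...}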