# dataqa-env / inference.py
# Uploaded with huggingface_hub by varb15 (commit 64eb355, verified)
#!/usr/bin/env python3
"""
DataQA Inference Script β€” Two-Phase Agent
------------------------------------------
LLM agent that plays the DataQA environment in two phases:
Phase 1: Identify all data quality issues
Phase 2: Propose fixes for identified issues
Uses the OpenAI client to interact with any OpenAI-compatible LLM API.
Required environment variables:
API_BASE_URL - LLM API endpoint (e.g., https://router.huggingface.co/v1)
MODEL_NAME - Model identifier (e.g., Qwen/Qwen2.5-72B-Instruct)
HF_TOKEN - HuggingFace token / API key
STDOUT FORMAT (mandatory for evaluation):
[START] task=<task_name> env=<benchmark> model=<model_name>
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
"""
from __future__ import annotations
import os
import re
import sys
import time
from typing import List, Optional
import requests
from openai import OpenAI
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# LLM endpoint; defaults to the HuggingFace inference router.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier passed to the chat-completions API.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# API key: HF_TOKEN takes precedence over API_KEY; may be None if neither is set.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# Base URL of the DataQA environment HTTP server.
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
# Benchmark name reported in the [START] log line.
BENCHMARK = "dataqa_env"
# Task identifiers submitted to the environment's /reset endpoint.
TASKS = ["easy", "medium", "hard", "alignment", "coding", "toolcalling", "moderation"]
# Maximum /step submissions per task before giving up.
MAX_STEPS_PER_TASK = 3
# ---------------------------------------------------------------------------
# Logging helpers (structured stdout β€” exact format required by evaluation)
# ---------------------------------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Emit the [START] banner line for one task run (format is parsed by the evaluator)."""
    line = " ".join(("[START]", f"task={task}", f"env={env}", f"model={model}"))
    print(line, flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] line in the exact format required by the evaluator.

    A falsy *error* (None or "") is rendered as the literal string "null".
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error if error else 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final [END] summary line consumed by the evaluator."""
    joined = ",".join(format(r, ".2f") for r in rewards)
    line = f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={joined}"
    print(line, flush=True)
# ---------------------------------------------------------------------------
# Environment HTTP client
# ---------------------------------------------------------------------------
class EnvHTTPClient:
    """Thin wrapper around the DataQA environment's HTTP API.

    Exposes the /health, /reset, and /step endpoints over a shared
    ``requests.Session`` so TCP connections are reused between calls.
    """

    def __init__(self, base_url: str):
        # Normalize the base URL so endpoint paths can be appended safely.
        self.base_url = base_url.rstrip("/")
        self.session = requests.Session()

    def health(self) -> bool:
        """Return True iff GET /health answers 200; any exception counts as unhealthy."""
        try:
            resp = self.session.get(f"{self.base_url}/health", timeout=10)
        except Exception:
            return False
        return resp.status_code == 200

    def reset(self, task_id: str = "easy") -> dict:
        """POST /reset for *task_id* and return the decoded JSON payload.

        Raises requests.HTTPError on a non-2xx response.
        """
        resp = self.session.post(
            f"{self.base_url}/reset",
            json={"task_id": task_id},
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()

    def step(self, issues: list[str], fixes: list[str], task_id: str = "easy") -> dict:
        """POST /step with identified issues and proposed fixes; return the JSON reply.

        Raises requests.HTTPError on a non-2xx response.
        """
        payload = {"action": {"issues": issues, "fixes": fixes, "task_id": task_id}}
        resp = self.session.post(f"{self.base_url}/step", json=payload, timeout=30)
        resp.raise_for_status()
        return resp.json()
# ---------------------------------------------------------------------------
# LLM Prompts
# ---------------------------------------------------------------------------
# Phase-1 system prompt: issue identification. The mandated output format
# (row:<N>,col:<name>,issue:<type>) must stay in sync with the regex in
# parse_llm_response below.
IDENTIFY_SYSTEM_PROMPT = """You are a data quality analyst. Your job is to inspect datasets and identify data quality issues.
You will be given:
1. A dataset in CSV format
2. A schema describing expected column types and constraints
3. Validation rules that the data should satisfy
You must identify ALL data quality issues and report each one in EXACTLY this format:
row:<row_number>,col:<column_name>,issue:<issue_type>
Supported issue types:
- missing_value (null, empty, or whitespace-only)
- wrong_type (value doesn't match expected type)
- duplicate_row (exact duplicate or duplicate key)
- out_of_range (value outside valid range)
- format_violation (wrong format, invalid enum value)
- inconsistent_value (computed field doesn't match, logical inconsistency)
- statistical_outlier (value is unreasonable given context)
- referential_integrity (foreign key violation)
CRITICAL INSTRUCTIONS FOR ROW NUMBERING:
- Row numbers refer to the ROW POSITION in the CSV data, NOT the value of any ID column
- Row 1 = the FIRST data row after the header
- Row 2 = the SECOND data row after the header
- DO NOT use the employee_id, order_id, or experiment_id as the row number
- Column names must match exactly (use the CSV header names, lowercase)
- Check EVERY row and EVERY column systematically
- Consider cross-column consistency (e.g., total = quantity * price)
- Look for subtle issues like whitespace-only values, near-duplicates
- Report ALL issues you find, even if uncertain
Respond with ONLY the list of issues, one per line. No other text.
Example: row:3,col:salary,issue:missing_value"""
# Phase-2 system prompt: fix proposal. The mandated output format
# (row:<N>,col:<name>,fix:<value>) must stay in sync with the regex in
# parse_fix_response below.
FIX_SYSTEM_PROMPT = """You are a data repair specialist. You have already identified data quality issues in a dataset. Now you must propose the correct values to fix each issue.
For each issue you identified, propose a fix in EXACTLY this format:
row:<row_number>,col:<column_name>,fix:<corrected_value>
Guidelines for proposing fixes:
- For missing_value: infer the correct value from context, schema, and other rows
- For wrong_type: convert to the correct type (e.g., "seventy-five thousand" β†’ "75000")
- For out_of_range: propose a value within the valid range that makes sense in context
- For format_violation: correct the format (e.g., "26/01/2024" β†’ "2024-01-26")
- For inconsistent_value: compute the correct value from related fields
- For duplicate_row: propose a corrected unique key or indicate removal
- For statistical_outlier: propose a reasonable value given the model/context
Use the schema, validation rules, and surrounding data to determine the correct fix.
Respond with ONLY the list of fixes, one per line. No other text.
Example: row:3,col:salary,fix:75000"""
def build_user_prompt(observation: dict, include_fixes: bool = False) -> str:
    """Assemble the user prompt sent to the LLM from an environment observation.

    Args:
        observation: Observation dict from /reset or /step; recognized keys are
            ``task_description``, ``schema_description``, ``validation_rules``,
            ``dataset_csv``, ``num_issues_hint`` and ``feedback``.
        include_fixes: When True, append the phase-2 instruction asking the
            model to propose fixes in addition to identifying issues.

    Returns:
        The prompt sections joined by blank lines.
    """
    # Fix: the original `obs = observation if isinstance(observation, dict)
    # else observation` was a no-op (both branches identical) — removed.
    parts = []
    if observation.get("task_description"):
        parts.append(f"TASK: {observation['task_description']}")
    parts.append(f"SCHEMA:\n{observation.get('schema_description', '')}")
    parts.append(f"VALIDATION RULES:\n{observation.get('validation_rules', '')}")
    parts.append(f"DATASET:\n{observation.get('dataset_csv', '')}")
    hint = observation.get("num_issues_hint", 0)
    if hint:
        parts.append(f"HINT: There are exactly {hint} issues to find.")
    feedback = observation.get("feedback", "")
    # Reset acknowledgements carry no diagnostic value, so skip them.
    if feedback and "reset" not in feedback.lower():
        parts.append(f"FEEDBACK FROM PREVIOUS ATTEMPT:\n{feedback}")
    if include_fixes:
        parts.append(
            "Now propose fixes for ALL issues. "
            "Use format: row:<N>,col:<name>,fix:<corrected_value>"
        )
    return "\n\n".join(parts)
def parse_llm_response(response: str) -> list[str]:
    """Extract normalized ``row:N,col:name,issue:type`` entries from raw LLM text.

    Tolerates numbered/bulleted list markers and flexible separators; column
    and issue names are lowercased in the returned strings.
    """
    pattern = re.compile(
        r"row\s*[:=]\s*(\d+)\s*[,;\s]+col(?:umn)?\s*[:=]\s*([\w_]+)"
        r"\s*[,;\s]+issue\s*[:=]\s*([\w_]+)",
        re.IGNORECASE,
    )
    results: list[str] = []
    for raw in response.strip().split("\n"):
        # Strip list markers such as "1.", "2)", "-", "*".
        cleaned = re.sub(r"^\s*[\d]+[.\)]\s*", "", raw.strip())
        cleaned = re.sub(r"^\s*[-*]\s*", "", cleaned).strip()
        if not cleaned:
            continue
        lowered = cleaned.lower()
        if "row" not in lowered or "col" not in lowered:
            continue
        m = pattern.search(cleaned)
        if m:
            row, col, issue = m.group(1), m.group(2).lower(), m.group(3).lower()
            results.append(f"row:{row},col:{col},issue:{issue}")
    return results
def parse_fix_response(response: str) -> list[str]:
    """Extract normalized ``row:N,col:name,fix:value`` entries from raw LLM text.

    Tolerates numbered/bulleted list markers and flexible separators; column
    names are lowercased and fix values are whitespace-stripped.
    """
    pattern = re.compile(
        r"row\s*[:=]\s*(\d+)\s*[,;\s]+col(?:umn)?\s*[:=]\s*([\w_]+)"
        r"\s*[,;\s]+fix\s*[:=]\s*(.+?)$",
        re.IGNORECASE,
    )
    results: list[str] = []
    for raw in response.strip().split("\n"):
        # Strip list markers such as "1.", "2)", "-", "*".
        cleaned = re.sub(r"^\s*[\d]+[.\)]\s*", "", raw.strip())
        cleaned = re.sub(r"^\s*[-*]\s*", "", cleaned).strip()
        if not cleaned:
            continue
        lowered = cleaned.lower()
        if "row" not in lowered or "fix" not in lowered:
            continue
        m = pattern.search(cleaned)
        if m:
            results.append(
                f"row:{m.group(1)},col:{m.group(2).lower()},fix:{m.group(3).strip()}"
            )
    return results
def call_llm(client: OpenAI, system_prompt: str, user_prompt: str) -> str:
    """Call the chat-completions API, retrying up to 3 times on rate limits.

    Args:
        client: OpenAI-compatible client pointed at API_BASE_URL.
        system_prompt: System message content.
        user_prompt: User message content.

    Returns:
        The assistant message text, or "" on failure. Non-rate-limit errors
        abort immediately; rate limits back off linearly (10s, 20s).
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.1,  # near-deterministic output for reliable parsing
                max_tokens=2048,
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            if "rate_limit" in str(e).lower() or "429" in str(e):
                # Fix: the original slept even after the final attempt, wasting
                # 30s before returning "" anyway. Only sleep if a retry remains.
                if attempt < attempts - 1:
                    wait = 10 * (attempt + 1)
                    print(f"[DEBUG] Rate limited, waiting {wait}s...", file=sys.stderr, flush=True)
                    time.sleep(wait)
            else:
                print(f"[DEBUG] LLM call failed: {e}", file=sys.stderr, flush=True)
                return ""
    return ""
def run_task(client: OpenAI, env: EnvHTTPClient, task_id: str) -> float:
    """Run a single task with a two-phase identify-then-fix strategy.

    Step 1 identifies issues only; steps 2+ also propose fixes, guided by the
    feedback carried in the updated observation.

    Args:
        client: LLM client used for both phases.
        env: HTTP client for the DataQA environment.
        task_id: One of the TASKS identifiers.

    Returns:
        Best reward observed across all steps (0.0 on total failure).
    """
    # Fixes vs. original: removed unused locals `last_issues`/`last_llm_output`,
    # dropped the always-true `not error_msg` guard, and removed the
    # placeholder-free f-string prefix on "ISSUES FOUND".
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    rewards: List[float] = []
    steps_taken = 0
    best_score = 0.0
    success = False
    try:
        reset_response = env.reset(task_id=task_id)
        observation = reset_response.get("observation", reset_response)
        for step_num in range(1, MAX_STEPS_PER_TASK + 1):
            error_msg = None
            # -- Phase 1: identify issues --
            user_prompt = build_user_prompt(observation)
            identify_output = call_llm(client, IDENTIFY_SYSTEM_PROMPT, user_prompt)
            issues = parse_llm_response(identify_output)
            if not issues:
                error_msg = "no issues parsed from LLM response"
            # -- Phase 2: propose fixes (from step 2 onward) --
            fixes: list[str] = []
            if issues and step_num >= 2:
                # Feed the identified issues back in so the model fixes exactly those.
                fix_prompt = build_user_prompt(observation, include_fixes=True)
                fix_prompt += "\n\nISSUES FOUND:\n" + "\n".join(issues)
                fix_output = call_llm(client, FIX_SYSTEM_PROMPT, fix_prompt)
                fixes = parse_fix_response(fix_output)
            # -- Submit to environment --
            # Truncate the logged action string; the full lists go to the env.
            action_str = ";".join(issues[:5]) if issues else "none"
            if fixes:
                action_str += "|fixes:" + ";".join(fixes[:3])
            step_response = env.step(issues, fixes, task_id=task_id)
            observation = step_response.get("observation", step_response)
            reward = float(step_response.get("reward", 0.0) or 0.0)
            done = bool(step_response.get("done", False))
            best_score = max(best_score, reward)
            rewards.append(reward)
            steps_taken = step_num
            log_step(
                step=step_num,
                action=action_str,
                reward=reward,
                done=done,
                error=error_msg,
            )
            if done:
                break
        success = best_score >= 0.5
    finally:
        # [END] is always emitted, even if the task raised mid-run.
        log_end(success=success, steps=steps_taken, score=best_score, rewards=rewards)
    return best_score
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Entry point: verify environment health, run every task, report scores."""
    for msg in (
        "DataQA Inference starting",
        f"ENV_URL={ENV_URL}",
        f"API_BASE_URL={API_BASE_URL}",
        f"MODEL_NAME={MODEL_NAME}",
    ):
        print(f"[DEBUG] {msg}", file=sys.stderr, flush=True)
    env = EnvHTTPClient(ENV_URL)
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY or "no-key")
    if not env.health():
        print("[DEBUG] Environment is not healthy. Exiting.", file=sys.stderr, flush=True)
        sys.exit(1)
    print("[DEBUG] Environment is healthy", file=sys.stderr, flush=True)
    scores = {}
    for task_id in TASKS:
        # A failed task scores 0.0 but never aborts the remaining tasks.
        try:
            scores[task_id] = run_task(llm_client, env, task_id)
        except Exception as exc:
            print(f"[DEBUG] Task {task_id} failed: {exc}", file=sys.stderr, flush=True)
            scores[task_id] = 0.0
    avg_score = sum(scores.values()) / len(scores) if scores else 0.0
    print(f"\n[DEBUG] FINAL RESULTS: {scores} avg={avg_score:.3f}", file=sys.stderr, flush=True)
if __name__ == "__main__":
    main()