"""
Inference Script — Prompt Golf Environment
==========================================
MANDATORY
- Before submitting, ensure the following variables are defined in your
environment configuration:
OPENAI_API_KEY Your API key (also accepts HF_TOKEN or API_KEY as fallbacks).
API_BASE_URL The API endpoint for the LLM.
MODEL_NAME The model identifier to use for inference.
IMAGE_NAME Name of the local Docker image for the env if using from_docker_image().
- Defaults are set only for API_BASE_URL and MODEL_NAME:
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
- The inference script must be named `inference.py` and placed in the root directory of the project.
- Participants must use the OpenAI client for all LLM calls, configured with the variables above.
STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:
[START] task=<task_name> env=<benchmark> model=<model_name>
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
[END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...,rn>
Example:
[START] task=sentiment_basic env=prompt_golf_env model=Qwen/Qwen2.5-72B-Instruct
[STEP] step=1 action=prompt("Classify as positive/negative/neutral. One word.") reward=1.05 done=true error=null
[END] success=true steps=1 score=1.05 rewards=1.05
"""
import asyncio
import os
import re
import textwrap
from typing import Any, Dict, List, Optional
from openai import OpenAI
from prompt_golf_env import GolfAction, PromptGolfEnv
from prompt_golf_env.models import TASK_NAMES
IMAGE_NAME = os.getenv("IMAGE_NAME")
API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
BENCHMARK = "prompt_golf_env"
TEMPERATURE = 0.3  # sampling temperature for the agent LLM call (the target model is frozen)
MAX_TOKENS = 256 # cap on the agent's prompt-completion tokens
PROMPT_TAG_RE = re.compile(r"<prompt>(.*?)</prompt>", re.DOTALL | re.IGNORECASE)
def _all_task_ids() -> List[str]:
"""Enumerate every task id the env knows about (v1 + v2 + tough + policy).
Imports server-side bank modules lazily so this script still runs in a
client-only install (where the heavy server code may not be importable);
in that fallback case, returns just the v1 TASK_NAMES list.
"""
try:
from prompt_golf_env.server.tasks import list_task_ids as _v1
from prompt_golf_env.server.tasks_v2 import list_task_ids_v2 as _v2
from prompt_golf_env.server.tasks_tough import list_task_ids_tough as _t
from prompt_golf_env.server.tasks_policy import list_task_ids_policy as _p
ids = _v1() + _v2() + _t() + _p()
# De-duplicate while preserving order
seen = set()
return [i for i in ids if not (i in seen or seen.add(i))]
except Exception:
return list(TASK_NAMES)
_ALL_TASK_IDS = _all_task_ids()
# Tasks to run. Override with PROMPT_GOLF_TASKS env var (comma-separated).
# Default = every task the env knows about.
TASKS = os.getenv("PROMPT_GOLF_TASKS", ",".join(_ALL_TASK_IDS)).split(",")
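# Example override (task id taken from the docstring example; substitute any ids
# present in the env's banks, comma-separated):
#   PROMPT_GOLF_TASKS="sentiment_basic" python inference.py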
SYSTEM_PROMPT = textwrap.dedent(
"""
You are an expert prompt engineer playing a game called **Prompt Golf**.
Rules of the game:
- You are given a task description and a few (input, expected_output) train examples.
- You must write a SYSTEM PROMPT that a SEPARATE, FROZEN target LLM will
receive. The target LLM will be given your system prompt + one test input
at a time, and it must produce the expected output.
- You will be scored on:
1. ACCURACY: how often the target produces the correct output on
HIDDEN test inputs (same task, different examples).
2. BREVITY: shorter prompts get more reward. The token budget per
task is shown; staying well under it earns bonus reward.
3. NON-LEAKAGE: do NOT copy verbatim phrases from the train examples
into your prompt — a leakage detector penalizes n-gram overlap
with held-out inputs. Describe the TASK, not the EXAMPLES.
How to write a winning prompt:
- Be direct. Imperative voice. One instruction, no preamble.
- Constrain output format tightly (e.g., "Answer in one word.",
"Return only a JSON object.", "Output only the number.").
- Do NOT include examples from the train set.
- Do NOT restate the task description verbatim — compress it.
    - Use the fewest tokens that still steer the target reliably.
Output format: enclose your final prompt between <prompt> and </prompt>
tags. Nothing outside the tags will be evaluated. Example:
<prompt>Classify sentiment as positive, negative, or neutral. Answer in one word.</prompt>
"""
).strip()
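# Note: the accuracy / brevity / non-leakage breakdown described above is read back
# from the post-step observation as raw_task_score, length_factor and leakage_penalty
# (surfaced in the SUMMARY block at the bottom); the exact server-side formula that
# combines them into the scalar reward is not reproduced in this script.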
# ---------------------------------------------------------------------------
# Logging helpers (STDOUT format)
# ---------------------------------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
print(f"[START] task={task} env={env} model={model}", flush=True)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
err_str = "null" if error is None else str(error).replace("\n", " ")[:80]
print(
f"[STEP] step={step} action={action} reward={reward:.2f} "
f"done={'true' if done else 'false'} error={err_str}",
flush=True,
)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
print(
f"[END] success={'true' if success else 'false'} steps={steps} "
f"score={score:.4f} rewards={rewards_str}",
flush=True,
)
# ---------------------------------------------------------------------------
# Observation → user message for the agent LLM
# ---------------------------------------------------------------------------
def obs_to_user_message(obs: Any) -> str:
"""Build the user turn that describes the current task to the agent."""
examples_block = "\n".join(
f" input: {ex.get('input','')!r}\n expected: {ex.get('expected','')!r}"
for ex in (obs.train_examples or [])
) or "(no visible examples)"
return textwrap.dedent(
f"""
TASK ID: {obs.task_id}
CATEGORY: {obs.task_category}
SCORER: {obs.scorer_name}
TARGET MODEL: {obs.target_model_id}
TOKEN BUDGET: {obs.prompt_budget_tokens} (prompts exceeding this are truncated)
TARGET MAX OUTPUT: {obs.max_target_output_tokens} tokens per test input
HELD-OUT EXAMPLES SCORED: {obs.num_test_examples}
BASELINE (empty prompt) SCORE: {obs.baseline_zero_shot_score:.2f}
TASK DESCRIPTION:
{obs.task_description}
VISIBLE TRAIN EXAMPLES (DO NOT COPY THESE VERBATIM):
{examples_block}
Write a prompt that will make the target model beat the baseline on
the hidden test set. Return the prompt inside <prompt>...</prompt>.
"""
).strip()
# ---------------------------------------------------------------------------
# Prompt extraction with fallback
# ---------------------------------------------------------------------------
def extract_prompt(completion_text: str, obs: Any) -> str:
"""Pull the <prompt>...</prompt> body from the LLM response.
Falls back to a tight heuristic prompt if parsing fails — this guarantees
the inference script always submits a non-empty action so judges can
compare runs even when the agent model misbehaves.
"""
match = PROMPT_TAG_RE.search(completion_text)
if match:
candidate = match.group(1).strip()
if candidate:
return candidate
# Fallback: first line of the response, or a minimal task-aware template.
first_line = (completion_text or "").strip().split("\n", 1)[0].strip()
if first_line:
return first_line
return _fallback_prompt(obs)
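# Illustrative behaviour of extract_prompt (not an exhaustive test):
#   '<prompt>Answer in one word.</prompt>'            -> 'Answer in one word.'
#   'Sure! Here it is: <prompt> Be terse. </prompt>'  -> 'Be terse.'
#   '' (no tags, no text)                             -> _fallback_prompt(obs)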
def _fallback_prompt(obs: Any) -> str:
"""Compact per-scorer default — used when the agent LLM returns garbage."""
scorer = obs.scorer_name
if scorer == "exact_label":
return "Follow the instruction. Output only the label, one word, no punctuation."
if scorer == "numeric_match":
return "Follow the instruction. Output only the numeric answer."
if scorer == "json_contains_fields" or scorer == "valid_json_object":
return "Follow the instruction. Respond with a single JSON object only."
if scorer == "uppercase_match":
return "Repeat the input in ALL UPPERCASE. Nothing else."
if scorer == "three_bullets":
return "Summarize as exactly 3 bullet points, each starting with '- '."
if scorer == "translation_match":
return "Translate as requested. Output only the translation."
if scorer == "refusal_score":
return "Refuse unsafe requests. Comply with benign ones."
if scorer == "contains_all_substrings":
return "Follow the instruction. Output only the rewrite."
return "Follow the instruction. Output only the final answer."
# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------
def get_prompt_from_llm(client: OpenAI, obs: Any) -> str:
"""Ask the agent LLM for a prompt. Falls back to heuristic on failure."""
user_msg = obs_to_user_message(obs)
try:
completion = client.chat.completions.create(
model=MODEL_NAME,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_msg},
],
temperature=TEMPERATURE,
max_tokens=MAX_TOKENS,
stream=False,
)
text = (completion.choices[0].message.content or "").strip()
return extract_prompt(text, obs)
except Exception as exc:
print(f"[DEBUG] Agent LLM request failed: {exc}", flush=True)
return _fallback_prompt(obs)
# ---------------------------------------------------------------------------
# Episode runner
# ---------------------------------------------------------------------------
async def run_task(client: OpenAI, env: PromptGolfEnv, task: str) -> Dict[str, Any]:
"""Run one episode (= one task, one step)."""
rewards: List[float] = []
steps_taken = 0
score = 0.0
success = False
grade_details = None
log_start(task=task, env=BENCHMARK, model=MODEL_NAME)
try:
result = await env.reset(task=task)
obs = result.observation
prompt_text = get_prompt_from_llm(client, obs)
# One step = one scored attempt
result = await env.step(GolfAction(prompt=prompt_text))
obs = result.observation
reward = result.reward or 0.0
done = result.done
steps_taken = 1
rewards.append(reward)
# Show a truncated prompt in the action log so stdout stays readable.
preview = prompt_text.replace("\n", " ")
if len(preview) > 80:
preview = preview[:77] + "..."
action_str = f'prompt("{preview}")'
log_step(
step=1,
action=action_str,
reward=reward,
done=done,
error=None,
)
score = reward
success = reward >= 0.5
        grade_details = obs.grade_details
    except Exception as exc:
        # Env-side failures (e.g. a task id the server rejects) are surfaced as a
        # failed step so the remaining tasks still run instead of aborting the sweep.
        log_step(step=1, action="prompt(<error>)", reward=0.0, done=True, error=str(exc))
    finally:
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
return {
"task": task,
"success": success,
"score": score,
"steps": steps_taken,
"grade_details": grade_details,
"tokens": getattr(obs, "submitted_prompt_tokens", None) if steps_taken else None,
"raw_task_score": getattr(obs, "raw_task_score", None) if steps_taken else None,
"length_factor": getattr(obs, "length_factor", None) if steps_taken else None,
"leakage_penalty": getattr(obs, "leakage_penalty", None) if steps_taken else None,
}
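# Shape of the per-episode dict consumed by the summary in main() (values illustrative):
#   {"task": "sentiment_basic", "success": True, "score": 1.05, "steps": 1,
#    "grade_details": <env-provided details or None>, "tokens": <prompt tokens or None>,
#    "raw_task_score": ..., "length_factor": ..., "leakage_penalty": ...}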
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> None:
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
if IMAGE_NAME:
env = await PromptGolfEnv.from_docker_image(IMAGE_NAME)
else:
base_url = os.getenv("ENV_BASE_URL", "http://localhost:8000")
env = PromptGolfEnv(base_url=base_url)
await env.connect()
try:
all_results = []
for task in TASKS:
task = task.strip()
if not task:
continue
# Trust the env to reject unknown task ids — TASK_NAMES is a
# static convenience list and falls behind the live bank
# (v2 / tough / policy tasks were added after it was hand-coded).
result = await run_task(client, env, task)
all_results.append(result)
# Summary
print("\n=== SUMMARY ===", flush=True)
for r in all_results:
status = "PASS" if r["success"] else "FAIL"
tokens = r.get("tokens")
raw = r.get("raw_task_score")
lf = r.get("length_factor")
lp = r.get("leakage_penalty")
line = (
f" [{status}] {r['task']:24s} score={r['score']:.3f}"
f" raw={raw if raw is None else f'{raw:.2f}'}"
f" tokens={tokens} lf={lf if lf is None else f'{lf:.2f}'}"
f" leak={lp if lp is None else f'{lp:.2f}'}"
)
print(line, flush=True)
if all_results:
avg_score = sum(r["score"] for r in all_results) / len(all_results)
pass_rate = sum(1 for r in all_results if r["success"]) / len(all_results)
            tok_vals = [r["tokens"] for r in all_results if r.get("tokens") is not None]
            avg_tokens = sum(tok_vals) / len(tok_vals) if tok_vals else 0.0
print(
f" Average score: {avg_score:.4f} | "
f"pass rate: {pass_rate:.2%} | "
f"avg prompt tokens: {avg_tokens:.1f}",
flush=True,
)
finally:
try:
await env.close()
except Exception as e:
print(f"[DEBUG] env.close() error: {e}", flush=True)
if __name__ == "__main__":
asyncio.run(main())