"""
Inference Script — Prompt Golf Environment
==========================================
MANDATORY
- Before submitting, ensure the following variables are defined in your
  environment configuration:
    OPENAI_API_KEY   Your API key (also accepts HF_TOKEN or API_KEY as fallbacks).
    API_BASE_URL     The API endpoint for the LLM.
    MODEL_NAME       The model identifier to use for inference.
    IMAGE_NAME       Name of the local Docker image for the env if using from_docker_image().

- Defaults are set only for API_BASE_URL and MODEL_NAME:
    API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")

- The inference script must be named `inference.py` and placed in the root directory of the project.
- Participants must use the OpenAI client for all LLM calls, configured with the variables above.

STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:

    [START] task=<task_name> env=<benchmark> model=<model_name>
    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END]   success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...,rn>

  Example:
    [START] task=sentiment_basic env=prompt_golf_env model=Qwen2.5-72B-Instruct
    [STEP] step=1 action=prompt("Classify as positive/negative/neutral. One word.") reward=1.05 done=true error=null
    [END] success=true steps=1 score=1.05 rewards=1.05
"""

import asyncio
import os
import re
import textwrap
from typing import Any, Dict, List, Optional

from openai import OpenAI

from prompt_golf_env import GolfAction, PromptGolfEnv
from prompt_golf_env.models import TASK_NAMES


IMAGE_NAME = os.getenv("IMAGE_NAME")
API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("API_KEY")

API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
BENCHMARK = "prompt_golf_env"

TEMPERATURE = 0.3
MAX_TOKENS = 256  # cap on the agent's prompt-completion tokens
PROMPT_TAG_RE = re.compile(r"<prompt>(.*?)</prompt>", re.DOTALL | re.IGNORECASE)
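# Example match (hypothetical input string):
#   PROMPT_TAG_RE.search("<prompt>Answer in one word.</prompt>").group(1)
#   -> "Answer in one word."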


def _all_task_ids() -> List[str]:
    """Enumerate every task id the env knows about (v1 + v2 + tough + policy).

    Imports server-side bank modules lazily so this script still runs in a
    client-only install (where the heavy server code may not be importable);
    in that fallback case, returns just the v1 TASK_NAMES list.
    """
    try:
        from prompt_golf_env.server.tasks import list_task_ids as _v1
        from prompt_golf_env.server.tasks_v2 import list_task_ids_v2 as _v2
        from prompt_golf_env.server.tasks_tough import list_task_ids_tough as _t
        from prompt_golf_env.server.tasks_policy import list_task_ids_policy as _p
        ids = _v1() + _v2() + _t() + _p()
        # De-duplicate while preserving order
        seen = set()
        return [i for i in ids if not (i in seen or seen.add(i))]
    except Exception:
        return list(TASK_NAMES)


_ALL_TASK_IDS = _all_task_ids()

# Tasks to run. Override with PROMPT_GOLF_TASKS env var (comma-separated).
# Default = every task the env knows about.
TASKS = os.getenv("PROMPT_GOLF_TASKS", ",".join(_ALL_TASK_IDS)).split(",")
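# e.g. restrict the run to a subset (the second id below is a placeholder):
#   PROMPT_GOLF_TASKS="sentiment_basic,<another_task_id>" python inference.py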


SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an expert prompt engineer playing a game called **Prompt Golf**.

    Rules of the game:
      - You are given a task description and a few (input, expected_output) train examples.
      - You must write a SYSTEM PROMPT that a SEPARATE, FROZEN target LLM will
        receive. The target LLM will be given your system prompt + one test input
        at a time, and it must produce the expected output.
      - You will be scored on:
          1. ACCURACY: how often the target produces the correct output on
             HIDDEN test inputs (same task, different examples).
          2. BREVITY: shorter prompts get more reward. The token budget per
             task is shown; staying well under it earns bonus reward.
          3. NON-LEAKAGE: do NOT copy verbatim phrases from the train examples
             into your prompt — a leakage detector penalizes n-gram overlap
             with held-out inputs. Describe the TASK, not the EXAMPLES.

    How to write a winning prompt:
      - Be direct. Imperative voice. One instruction, no preamble.
      - Constrain output format tightly (e.g., "Answer in one word.",
        "Return only a JSON object.", "Output only the number.").
      - Do NOT include examples from the train set.
      - Do NOT restate the task description verbatim — compress it.
      - Use the fewest tokens that still steers the target reliably.

    Output format: enclose your final prompt between <prompt> and </prompt>
    tags. Nothing outside the tags will be evaluated. Example:

        <prompt>Classify sentiment as positive, negative, or neutral. Answer in one word.</prompt>
    """
).strip()


# ---------------------------------------------------------------------------
# Logging helpers (STDOUT format)
# ---------------------------------------------------------------------------

def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    err_str = "null" if error is None else str(error).replace("\n", " ")[:80]
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={'true' if done else 'false'} error={err_str}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={'true' if success else 'false'} steps={steps} "
        f"score={score:.4f} rewards={rewards_str}",
        flush=True,
    )


# ---------------------------------------------------------------------------
# Observation → user message for the agent LLM
# ---------------------------------------------------------------------------

def obs_to_user_message(obs: Any) -> str:
    """Build the user turn that describes the current task to the agent."""
    examples_block = "\n".join(
        f"  input: {ex.get('input','')!r}\n  expected: {ex.get('expected','')!r}"
        for ex in (obs.train_examples or [])
    ) or "(no visible examples)"

    return textwrap.dedent(
        f"""
        TASK ID: {obs.task_id}
        CATEGORY: {obs.task_category}
        SCORER: {obs.scorer_name}
        TARGET MODEL: {obs.target_model_id}
        TOKEN BUDGET: {obs.prompt_budget_tokens}  (prompts exceeding this are truncated)
        TARGET MAX OUTPUT: {obs.max_target_output_tokens} tokens per test input
        HELD-OUT EXAMPLES SCORED: {obs.num_test_examples}
        BASELINE (empty prompt) SCORE: {obs.baseline_zero_shot_score:.2f}

        TASK DESCRIPTION:
        {obs.task_description}

        VISIBLE TRAIN EXAMPLES (DO NOT COPY THESE VERBATIM):
        {examples_block}

        Write a prompt that will make the target model beat the baseline on
        the hidden test set. Return the prompt inside <prompt>...</prompt>.
        """
    ).strip()


# ---------------------------------------------------------------------------
# Prompt extraction with fallback
# ---------------------------------------------------------------------------

def extract_prompt(completion_text: str, obs: Any) -> str:
    """Pull the <prompt>...</prompt> body from the LLM response.

    Falls back to a tight heuristic prompt if parsing fails — this guarantees
    the inference script always submits a non-empty action so judges can
    compare runs even when the agent model misbehaves.
    """
    match = PROMPT_TAG_RE.search(completion_text)
    if match:
        candidate = match.group(1).strip()
        if candidate:
            return candidate
    # Fallback: first line of the response, or a minimal task-aware template.
    first_line = (completion_text or "").strip().split("\n", 1)[0].strip()
    if first_line:
        return first_line
    return _fallback_prompt(obs)


def _fallback_prompt(obs: Any) -> str:
    """Compact per-scorer default — used when the agent LLM returns garbage."""
    scorer = obs.scorer_name
    if scorer == "exact_label":
        return "Follow the instruction. Output only the label, one word, no punctuation."
    if scorer == "numeric_match":
        return "Follow the instruction. Output only the numeric answer."
    if scorer == "json_contains_fields" or scorer == "valid_json_object":
        return "Follow the instruction. Respond with a single JSON object only."
    if scorer == "uppercase_match":
        return "Repeat the input in ALL UPPERCASE. Nothing else."
    if scorer == "three_bullets":
        return "Summarize as exactly 3 bullet points, each starting with '- '."
    if scorer == "translation_match":
        return "Translate as requested. Output only the translation."
    if scorer == "refusal_score":
        return "Refuse unsafe requests. Comply with benign ones."
    if scorer == "contains_all_substrings":
        return "Follow the instruction. Output only the rewrite."
    return "Follow the instruction. Output only the final answer."


# ---------------------------------------------------------------------------
# LLM call
# ---------------------------------------------------------------------------

def get_prompt_from_llm(client: OpenAI, obs: Any) -> str:
    """Ask the agent LLM for a prompt. Falls back to heuristic on failure."""
    user_msg = obs_to_user_message(obs)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        return extract_prompt(text, obs)
    except Exception as exc:
        print(f"[DEBUG] Agent LLM request failed: {exc}", flush=True)
        return _fallback_prompt(obs)


# ---------------------------------------------------------------------------
# Episode runner
# ---------------------------------------------------------------------------

async def run_task(client: OpenAI, env: PromptGolfEnv, task: str) -> Dict[str, Any]:
    """Run one episode (= one task, one step)."""
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    grade_details = None

    log_start(task=task, env=BENCHMARK, model=MODEL_NAME)

    try:
        result = await env.reset(task=task)
        obs = result.observation

        prompt_text = get_prompt_from_llm(client, obs)

        # One step = one scored attempt
        result = await env.step(GolfAction(prompt=prompt_text))
        obs = result.observation
        reward = result.reward or 0.0
        done = result.done
        steps_taken = 1
        rewards.append(reward)

        # Show a truncated prompt in the action log so stdout stays readable.
        preview = prompt_text.replace("\n", " ")
        if len(preview) > 80:
            preview = preview[:77] + "..."
        action_str = f'prompt("{preview}")'

        log_step(
            step=1,
            action=action_str,
            reward=reward,
            done=done,
            error=None,
        )

        score = reward
        success = reward >= 0.5
        grade_details = obs.grade_details
    except Exception as exc:
        # A failed reset/step should not abort the remaining tasks; surface
        # the error via the [STEP] line and fall through to [END].
        log_step(
            step=max(steps_taken, 1),
            action="error",
            reward=0.0,
            done=True,
            error=str(exc),
        )
    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {
        "task": task,
        "success": success,
        "score": score,
        "steps": steps_taken,
        "grade_details": grade_details,
        "tokens": getattr(obs, "submitted_prompt_tokens", None) if steps_taken else None,
        "raw_task_score": getattr(obs, "raw_task_score", None) if steps_taken else None,
        "length_factor": getattr(obs, "length_factor", None) if steps_taken else None,
        "leakage_penalty": getattr(obs, "leakage_penalty", None) if steps_taken else None,
    }


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

async def main() -> None:
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    if IMAGE_NAME:
        env = await PromptGolfEnv.from_docker_image(IMAGE_NAME)
    else:
        base_url = os.getenv("ENV_BASE_URL", "http://localhost:8000")
        env = PromptGolfEnv(base_url=base_url)
        await env.connect()

    try:
        all_results = []
        for task in TASKS:
            task = task.strip()
            if not task:
                continue
            # Trust the env to reject unknown task ids — TASK_NAMES is a
            # static convenience list and falls behind the live bank
            # (v2 / tough / policy tasks were added after it was hand-coded).
            result = await run_task(client, env, task)
            all_results.append(result)

        # Summary
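        # Each line below ends up looking roughly like (values illustrative):
        #   [PASS] sentiment_basic          score=1.050  raw=1.00  tokens=12  lf=1.10  leak=0.00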
        print("\n=== SUMMARY ===", flush=True)
        for r in all_results:
            status = "PASS" if r["success"] else "FAIL"
            tokens = r.get("tokens")
            raw = r.get("raw_task_score")
            lf = r.get("length_factor")
            lp = r.get("leakage_penalty")
            line = (
                f"  [{status}] {r['task']:24s} score={r['score']:.3f}"
                f"  raw={raw if raw is None else f'{raw:.2f}'}"
                f"  tokens={tokens}  lf={lf if lf is None else f'{lf:.2f}'}"
                f"  leak={lp if lp is None else f'{lp:.2f}'}"
            )
            print(line, flush=True)

        if all_results:
            avg_score = sum(r["score"] for r in all_results) / len(all_results)
            pass_rate = sum(1 for r in all_results if r["success"]) / len(all_results)
            tok_sum = sum((r.get("tokens") or 0) for r in all_results)
            avg_tokens = tok_sum / len(all_results)
            print(
                f"  Average score: {avg_score:.4f}  |  "
                f"pass rate: {pass_rate:.2%}  |  "
                f"avg prompt tokens: {avg_tokens:.1f}",
                flush=True,
            )
    finally:
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error: {e}", flush=True)


if __name__ == "__main__":
    asyncio.run(main())