negotiation-openenv / inference.py
MridulNegi2005's picture
Major env overhaul: opponent negotiates naturally, gentler time penalty, relative aggression, simplified agent
3f2a3ab
"""
Inference Script — OpenEnv Negotiation Environment
Runs LLM agent against all 3 tasks, produces structured logs.
Uses OpenAI-compatible client with HuggingFace router.
STDOUT format (strict — parsed by automated judges):
[START] task=<name> env=<benchmark> model=<model>
[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
[END] success=<true|false> steps=<n> score=<0.0000> rewards=<r1,r2,...,rn>
All other output goes to stderr.
"""
import os
import re
import sys
from openai import OpenAI
from env_wrapper import EnvWrapper
from tasks import ALL_TASKS, get_grader
# First whole-word action token in the model output.
# - Word boundaries stop "unacceptable" / "acceptable" from being read as ACCEPT.
# - An OFFER price may be preceded by ":" and/or "$" and may use thousands
#   separators ("OFFER $1,200").
_ACTION_PATTERN = re.compile(
    r"\bOFFER\b[\s:]*\$?\s*(?P<price>\d{1,3}(?:,\d{3})+|\d+)"
    r"|\b(?P<verb>ACCEPT|REJECT)\b",
    re.IGNORECASE,
)


def parse_action(llm_text: str):
    """Parse LLM output into an ``(action_str, action_price, error)`` triple.

    The leftmost recognised action in *llm_text* wins; matching is
    case-insensitive and only whole words count.

    Returns:
        ``("OFFER <n>", n, None)`` for an offer with an integer price,
        ``("ACCEPT", 0, None)`` or ``("REJECT", 0, None)`` for the bare verbs,
        ``(None, 0, "no action match")`` when nothing is recognised,
        including when *llm_text* is empty or ``None``.
    """
    if not llm_text:
        return None, 0, "no action match"

    match = _ACTION_PATTERN.search(llm_text)
    if match is None:
        return None, 0, "no action match"

    price_text = match.group("price")
    if price_text is not None:
        # The pattern guarantees digits (plus optional commas), so int() cannot fail.
        price = int(price_text.replace(",", ""))
        return f"OFFER {price}", price, None

    return match.group("verb").upper(), 0, None
# Tunables for prompt construction and the agent-side safety guardrails.
_HISTORY_WINDOW = 5         # most recent rounds included in each prompt
_ACCEPT_COUNTER_STEP = 50   # concession used when a losing ACCEPT becomes a counter-offer
_VALUATION_MARGIN = 10      # distance always kept between our offer and our valuation
_MAX_CONCESSION = 120       # largest move away from our previous offer in one round
_ERROR_TEXT_LIMIT = 50      # characters of exception text kept in the [STEP] log


def _build_prompt(obs, step_n, history_for_prompt):
    """Render the per-round negotiation prompt from the current observation."""
    value = obs.agent_value

    history_text = ""
    if history_for_prompt:
        history_lines = [
            f" Round {h['round']}: You → {h['agent']}, Opponent → {h['opp']}"
            for h in history_for_prompt[-_HISTORY_WINDOW:]
        ]
        history_text = "Negotiation history:\n" + "\n".join(history_lines) + "\n\n"

    # Any role other than "buyer" is prompted as a seller.
    if obs.role == "buyer":
        target_goal = "buy for as low as possible (below your maximum value)"
        aim = (
            f"As a buyer with valuation {value}, aim to pay as LITTLE as possible"
            " — profit = valuation minus price."
        )
        accept_side = "below"
        hard_rule = f"Your offer must be BELOW {value}. Offering above it loses you money."
    else:
        target_goal = "sell for as high as possible (above your minimum value)"
        aim = (
            f"As a seller with valuation {value}, aim to sell as HIGH as possible"
            " — profit = price minus valuation."
        )
        accept_side = "above"
        hard_rule = f"Your offer must be ABOVE {value}. Offering below it loses you money."

    return (
        f"You are an expert negotiator acting as a {obs.role}. "
        f"Your objective is to {target_goal} and maximize your profit.\n"
        "CURRENT STATE:\n"
        f"* Your PRIVATE Valuation: {value} (your absolute limit — NEVER go past this)\n"
        f"* Current offer on the table: {obs.current_offer}\n"
        f"* Round: {step_n} of {obs.max_rounds}\n"
        f"* Opponent's last action: {obs.last_opponent_action}\n"
        f"* Opponent's last offer: {obs.last_opponent_offer}\n"
        f"{history_text}STRATEGY:\n"
        f"- Start your first offer at about 40-50% of the opening price. {aim}\n"
        "- Concede slowly each round (50-80 per round), watching the opponent move toward you.\n"
        f"- If the opponent's counter is {accept_side} {value}, ACCEPT it — that's guaranteed profit!\n"
        "- Close within 3-5 rounds for best time bonus.\n"
        "- NEVER REJECT — rejection = -50 penalty.\n"
        f"HARD RULE: {hard_rule}\n"
        "Choose ONE action:\n"
        "* OFFER <price>\n"
        "* ACCEPT\n"
        "* REJECT\n"
        "Respond with ONLY your action. Example: OFFER 450"
    )


def _request_action(client, model_name, prompt):
    """Ask the model for an action, retrying once with a stricter instruction.

    Returns ``(action_str, action_price, error_msg)``. ``error_msg`` is the
    string ``"null"`` on success. Any failure falls back to ``("REJECT", 0, ...)``.
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=20,
            temperature=0.3,
        )
        # The OpenAI client types message content as optional; treat None as
        # empty text so it goes through the retry instead of raising on .strip().
        llm_text = (response.choices[0].message.content or "").strip()
        action, price, _ = parse_action(llm_text)
        if action:
            return action, price, "null"

        retry_response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": llm_text},
                {"role": "user", "content": "Output strictly ONLY ONE of: 'OFFER <price>', 'ACCEPT', or 'REJECT'. Nothing else."},
            ],
            max_tokens=15,
            temperature=0.1,
        )
        retry_text = (retry_response.choices[0].message.content or "").strip()
        action, price, _ = parse_action(retry_text)
        if action:
            return action, price, "null"
        return "REJECT", 0, "parse error on retry, defaulting to REJECT"
    except Exception as e:  # best-effort boundary: one failed call must not abort the episode
        # Collapse whitespace so the message cannot break the one-line [STEP] record.
        detail = " ".join(str(e).split())[:_ERROR_TEXT_LIMIT]
        return "REJECT", 0, f"API_Error: {detail}"


def _apply_guardrails(obs, action_str, action_price, last_agent_offer):
    """Keep the agent from accepting or offering past its own valuation.

    Returns ``(action_str, action_price, last_agent_offer)`` with the offer
    clamped and ``last_agent_offer`` updated when an offer is made.
    """
    is_buyer = obs.role == "buyer"
    value = obs.agent_value

    # ACCEPT guard: never accept a deal worse than our valuation.
    if action_str == "ACCEPT":
        opp_offer = obs.last_opponent_offer
        # No opponent offer means there is nothing to compare against; pass ACCEPT through.
        losing_deal = opp_offer is not None and (
            (is_buyer and opp_offer > value)
            or (obs.role == "seller" and opp_offer < value)
        )
        if losing_deal:
            action_str = "OFFER"
            if last_agent_offer is not None:
                step = _ACCEPT_COUNTER_STEP if is_buyer else -_ACCEPT_COUNTER_STEP
                action_price = last_agent_offer + step
            else:
                action_price = int(value * (0.6 if is_buyer else 1.4))

    if action_str.startswith("OFFER") and action_price > 0:
        if is_buyer:
            # Valuation clamp, then concession cap (no panic jumps upward).
            action_price = min(action_price, value - _VALUATION_MARGIN)
            if last_agent_offer is not None:
                action_price = min(action_price, last_agent_offer + _MAX_CONCESSION)
        else:
            action_price = max(action_price, value + _VALUATION_MARGIN)
            if last_agent_offer is not None:
                action_price = max(action_price, last_agent_offer - _MAX_CONCESSION)
        action_str = f"OFFER {action_price}"
        last_agent_offer = action_price

    return action_str, action_price, last_agent_offer


def run_task(client, model_name: str, task_config):
    """
    Run a single task: the LLM negotiates against the environment.

    Emits one [START] line, one [STEP] line per round and one [END] line on
    stdout. [END] is printed from a ``finally`` block, so it appears even when
    the episode loop raises; the exception then propagates to the caller.

    Args:
        client: OpenAI-compatible client used for chat completions.
        model_name: Model identifier passed to the API and echoed in [START].
        task_config: Task description providing ``name``, ``opp_type``,
            ``agent_value``, ``opponent_value``, ``agent_role`` and ``max_rounds``.

    Returns:
        The result mapping produced by the task's grader (it provides at least
        ``success`` and ``score``).
    """
    env = EnvWrapper(
        opp_type=task_config.opp_type,
        a_val=task_config.agent_value,
        o_val=task_config.opponent_value,
        agent_role=task_config.agent_role,
        max_rounds=task_config.max_rounds,
    )
    obs = env.reset()
    print(f"[START] task={task_config.name} env=negotiation model={model_name}", flush=True)

    done = False
    step_n = 0
    rewards = []
    deal_made = False
    history_for_prompt = []
    last_agent_offer = None

    try:
        while not done and step_n < env.max_rounds:
            step_n += 1

            prompt = _build_prompt(obs, step_n, history_for_prompt)
            action_str, action_price, error_msg = _request_action(client, model_name, prompt)
            action_str, action_price, last_agent_offer = _apply_guardrails(
                obs, action_str, action_price, last_agent_offer
            )

            # ── Step the environment ──
            obs, reward, done, info = env.step(action_str, action_price)
            rewards.append(reward)

            if done and info.get("deal_type") in ("agent_accepted", "opponent_accepted"):
                deal_made = True

            # History uses the post-step observation, i.e. the opponent's reply to this action.
            if obs.last_opponent_action == "OFFER":
                opp_text = f"{obs.last_opponent_action} {obs.last_opponent_offer}"
            else:
                opp_text = obs.last_opponent_action
            history_for_prompt.append({
                "round": step_n,
                "agent": action_str,
                "opp": opp_text,
            })

            # ── Log step (stdout, parsed by judges) ──
            log_action = action_str if not action_str.startswith("OFFER") else f"OFFER {action_price}"
            print(f"[STEP] step={step_n} action={log_action} reward={reward:.2f} done={str(done).lower()} error={error_msg}", flush=True)
    finally:
        # [END] MUST always be printed, even on exceptions
        grader = get_grader(task_config)
        result = grader.grade(rewards, step_n, deal_made)
        rewards_str = ",".join(f"{r:.2f}" for r in rewards)
        score = result['score']
        print(f"[END] success={str(result['success']).lower()} steps={step_n} score={score:.4f} rewards={rewards_str}", flush=True)

    # Returning outside the finally block lets an in-flight exception propagate.
    return result
def main():
    """Run every task in ``ALL_TASKS`` and print a summary to stderr.

    Configuration comes from the environment: ``API_BASE_URL`` and
    ``MODEL_NAME`` have defaults, ``HF_TOKEN`` is required. Without it the
    process exits with status 1 before any client is created. Only
    ``run_task`` writes to stdout; everything printed here goes to stderr.
    """
    api_base_url = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model_name = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        print("ERROR: HF_TOKEN environment variable is not set.", file=sys.stderr)
        print("Set it with: export HF_TOKEN='your_token_here'", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=api_base_url, api_key=hf_token)

    # Debug info goes to stderr only
    banner = "=" * 60
    print(banner, file=sys.stderr)
    print("NEGOTIATION ENVIRONMENT — OpenEnv Inference", file=sys.stderr)
    print(f"Model: {model_name}", file=sys.stderr)
    print(f"API: {api_base_url}", file=sys.stderr)
    print(banner, file=sys.stderr)

    all_results = [run_task(client, model_name, task) for task in ALL_TASKS]

    # ── Summary to stderr (not parsed) ──
    print("\n" + banner, file=sys.stderr)
    print("SUMMARY", file=sys.stderr)
    print(banner, file=sys.stderr)
    for r in all_results:
        status = "PASS" if r["success"] else "FAIL"
        print(f" [{status}] {r['task']} ({r['difficulty']}): score={r['score']:.4f} "
              f"steps={r['steps']} deal={r['deal_made']} threshold={r['threshold']}",
              file=sys.stderr)

    # Guard the average: an empty task list would otherwise divide by zero.
    if all_results:
        avg_score = sum(r["score"] for r in all_results) / len(all_results)
        print(f"\n Average Score: {avg_score:.4f}", file=sys.stderr)
    else:
        print("\n No tasks were run.", file=sys.stderr)
    print(banner, file=sys.stderr)


if __name__ == "__main__":
    main()