# debatefloor / inference_debatefloor.py
# AniketAsla's picture
# sync: mirror git d05fcb5 to Space
# b4ac377 verified
"""
inference_debatefloor.py
DebateFloor — Baseline Agent
Runs the scripted baseline tasks (five are defined in TASK_CONFIG) against the DebateFloor environment over HTTP.
Declares calibrated confidence (HIGH/MED/LOW) on every terminal action.
MANDATORY STDOUT FORMAT — do not change:
[START] task=<task_id> env=debatefloor model=<model> confidence_required=true
[STEP] step=<n> action=<action_type> reward=<r> confidence=<conf|null> done=<bool> error=<msg|None>
[END] success=<bool> steps=<n> total_reward=<r> calibration_score=<s> decision=<correct|wrong|none>
Usage:
python inference_debatefloor.py --task contradictory_claim --model gpt-4o
python inference_debatefloor.py --all-tasks --seed 42 --base-url http://localhost:7860
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional
import requests
# ─────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────
# Where the DebateFloor environment server is expected to listen by default.
DEFAULT_BASE_URL = "http://localhost:7860"

# Environment-driven settings; each falls back to a fixed default when unset.
DEFAULT_MODEL = os.environ.get("MODEL_NAME", "gpt-4o")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")

# Per-task baseline settings: the confidence label declared on the terminal
# action and the kind of terminal decision the scripted strategy takes.
TASK_CONFIG = {
    # Obvious approval, so HIGH confidence.
    "clean_claim": dict(terminal_confidence="HIGH", strategy="approve"),
    # Fraud detected but some uncertainty remains, so MED.
    "contradictory_claim": dict(terminal_confidence="MED", strategy="deny"),
    # NEW-7: four grounded signals with ground_truth_confidence=0.70, so MED.
    # The canonical decision is escalate_to_human (the env normalises it to
    # request_investigation).
    "distribution_shift_claim": dict(terminal_confidence="MED", strategy="escalate"),
    # ground_truth_confidence=0.90 but ring scope is partly unknown, so MED.
    # The canonical decision is escalate_to_human (the env normalises it to
    # request_investigation).
    "coordinated_fraud": dict(terminal_confidence="MED", strategy="escalate"),
    # Four grounded signals with ground_truth_confidence=0.90; ID forgery is
    # never 100% certain, so MED.
    "identity_fraud": dict(terminal_confidence="MED", strategy="deny"),
}

# Task ids in declaration order; used for CLI choices and --all-tasks runs.
ALL_TASKS = [*TASK_CONFIG]
# ─────────────────────────────────────────────────────────────
# HTTP CLIENT
# ─────────────────────────────────────────────────────────────
class DebateFloorClient:
    """Thin HTTP client for a DebateFloor environment server.

    The client remembers the ``session_id`` returned by ``/reset`` and sends
    it along with every subsequent ``/step`` request.
    """

    def __init__(self, base_url: str = DEFAULT_BASE_URL):
        # Strip trailing slashes so endpoint paths can be appended verbatim.
        self.base_url = base_url.rstrip("/")
        # Set by reset(); step() refuses to run until it is populated.
        self.session_id: Optional[str] = None

    def health(self) -> Dict:
        """Return the decoded JSON body of ``GET /health``.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.get(f"{self.base_url}/health", timeout=10)
        # Same status handling as reset()/step(): surface HTTP failures as a
        # clear HTTPError instead of trying to parse an error page as JSON.
        r.raise_for_status()
        return r.json()

    def reset(self, task_id: str, seed: int = 42) -> Dict:
        """Start a new episode via ``POST /reset`` and return the JSON response.

        The ``session_id`` field of the response (``None`` if absent) is
        stored on the client for later ``step()`` calls.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        r = requests.post(
            f"{self.base_url}/reset",
            json={"task_id": task_id, "seed": seed},
            timeout=15,
        )
        r.raise_for_status()
        data = r.json()
        self.session_id = data.get("session_id")
        return data

    def step(self, action: Dict[str, Any]) -> Dict:
        """Send one action via ``POST /step`` and return the JSON response.

        Raises:
            RuntimeError: if no session is active (``reset()`` not called, or
                the reset response carried no ``session_id``).
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        if not self.session_id:
            raise RuntimeError("No active session. Call reset() first.")
        r = requests.post(
            f"{self.base_url}/step",
            json={"action": action, "session_id": self.session_id},
            timeout=15,
        )
        r.raise_for_status()
        return r.json()
# ─────────────────────────────────────────────────────────────
# DETERMINISTIC AGENT STRATEGIES
# Each strategy is a scripted sequence of actions. In production
# you'd replace this with LLM completions. This baseline
# demonstrates the confidence declaration mechanic clearly.
# ─────────────────────────────────────────────────────────────
def _strategy_clean_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]:
"""Validate key documents, estimate payout (variant-aware), approve with HIGH.
CF-4 fix: read declared_cost_inr / estimate_inr from the observation so the
payout estimate falls inside the per-variant payout_band. With the previous
hardcoded amount=150000, payout_accuracy was 0 for every variant; reading
the variant value pushes payout_accuracy to 1.0 AND lets the per-variant
band drift be reflected in evidence/reasoning text. See PLAN.md > CF-4.
"""
observation = obs.get("observation", obs)
docs = observation.get("documents", [])
actions = []
for doc in docs[:2]:
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": doc["doc_id"]},
"reasoning": (
f"Verify document {doc.get('doc_id', '?')} "
f"({doc.get('doc_type', 'unknown')}) before approving."
),
})
declared_cost = None
estimate = None
for doc in docs:
meta = doc.get("metadata", {}) or {}
if declared_cost is None and "declared_cost_inr" in meta:
declared_cost = float(meta["declared_cost_inr"])
if estimate is None and "estimate_inr" in meta:
estimate = float(meta["estimate_inr"])
payout_amount = estimate if estimate is not None else (declared_cost if declared_cost is not None else 50000.0)
actions.append({
"action_type": "estimate_payout",
"parameters": {"amount_inr": payout_amount},
"reasoning": (
f"Use estimate INR {payout_amount:,.0f} read from doc metadata "
f"(declared INR {declared_cost:,.0f})."
if declared_cost is not None
else f"Use estimate INR {payout_amount:,.0f} (no declared cost in docs)."
),
})
approve_reason_parts = ["All documents verified", "no fraud signals"]
if declared_cost is not None:
approve_reason_parts.append(f"declared cost INR {declared_cost:,.0f}")
if estimate is not None and estimate != declared_cost:
approve_reason_parts.append(f"garage estimate INR {estimate:,.0f}")
approve_reason = ". ".join(approve_reason_parts) + ". Clean claim approved."
actions.append({
"action_type": "approve_claim",
"confidence": "HIGH",
"parameters": {"reason": approve_reason},
"reasoning": "Clean claim with consistent variant-specific values — HIGH confidence justified.",
})
return actions
def _strategy_contradictory_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]:
"""Investigate document contradictions, flag signals, deny with MED.
CF-4 fix: cite per-variant incident_date / admission_date / claimed_cost /
standard_rate values from the observation so evidence text reflects what
the variant actually shipped. The flag_id keywords still trigger
get_evidence_keyword_hints(), so signal scoring is preserved.
"""
observation = obs.get("observation", obs)
docs = observation.get("documents", [])
actions = []
for doc in docs[:3]:
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": doc["doc_id"]},
"reasoning": (
f"Validate {doc.get('doc_id', '?')} "
f"({doc.get('doc_type', 'unknown')}) — looking for cross-doc contradictions."
),
})
actions.append({
"action_type": "query_historical_data",
"parameters": {},
"reasoning": "Check for prior similar claims that could indicate pattern fraud.",
})
incident_date = None
admission_date = None
claimed_cost = None
standard_rate = None
for doc in docs:
meta = doc.get("metadata", {}) or {}
if incident_date is None and "incident_date" in meta:
incident_date = meta["incident_date"]
if admission_date is None and "admission_date" in meta:
admission_date = meta["admission_date"]
if claimed_cost is None and "claimed_cost_inr" in meta:
claimed_cost = meta["claimed_cost_inr"]
if standard_rate is None and "standard_rate_inr" in meta:
standard_rate = meta["standard_rate_inr"]
if incident_date and admission_date:
date_evidence = (
f"Claim form records incident date {incident_date} but hospital "
f"admission documented on {admission_date} — date mismatch confirmed "
"across documents."
)
else:
date_evidence = (
"Claim form incident date does not match hospital admission record — "
"date mismatch confirmed across documents."
)
if claimed_cost is not None and standard_rate is not None and standard_rate:
ratio = float(claimed_cost) / float(standard_rate)
cost_evidence = (
f"Hospital bill INR {claimed_cost:,} is {ratio:.2f}x the regional "
f"standard cost of INR {standard_rate:,} — cost inflation pattern "
"indicating overbilled charges."
)
else:
cost_evidence = (
"Hospital bill rate is approximately 2.4 times the regional standard "
"cost — cost inflation pattern indicating overbilled charges."
)
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {"flag_id": "date_mismatch", "evidence": date_evidence},
"reasoning": "Date inconsistency between claim form and admission record is a grounded fraud indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {"flag_id": "cost_inflation", "evidence": cost_evidence},
"reasoning": "Inflated cost versus benchmark suggests billing fraud.",
})
# Convene debate panel — adversarial review before terminal decision
actions.append({
"action_type": "convene_debate_panel",
"parameters": {},
"reasoning": "Contradictory evidence warrants adversarial review. Panel will pressure-test fraud signals.",
})
# Terminal: deny with MED confidence (evidence found but some uncertainty remains)
actions.append({
"action_type": "deny_claim",
"confidence": "MED",
"parameters": {"reason": "Date mismatch and cost inflation confirmed across documents. Fraud signals grounded in evidence."},
"reasoning": "Sufficient evidence to deny, but complex case warrants MED not HIGH confidence.",
})
return actions
def _strategy_distribution_shift_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]:
"""Distribution-shift ring — uses the NEW-7 discovery hooks added to the
environment so this task can finally earn evidence credit.
Env discovery contract (post NEW-7 fix; see app/environment.py and
app/tasks.py:get_evidence_keyword_hints):
validate_document(DOC-41) → records recent_policy_cluster
validate_document(DOC-42) → records shared_repair_shop_far
query_linked_claim(CLM-DIST-602), then (CLM-DIST-603) → CLM-DIST-604
surfaces; on the 2nd query the shared emergency_contact is detected
across queried claims → records shared_emergency_contact; the broker
check fires for any CLM-DIST-* once 2+ claims have been queried →
records clustered_policy_broker.
near_identical_descriptions has no doc-level discovery hook for this
task (the task's primary docs do not contain the cross-claim
narrative), so we skip flagging it — symmetric to coordinated_fraud
which skips shared_emergency_contact for the same reason.
Result: 4 of 5 expected_signals discovered + flagged with grounded
evidence. evidence_quality = evidence_hits / evidence_total = 4/4 = 1.0.
"""
actions: List[Dict] = []
# 1. Validate the two documents whose signals are auto-recorded
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": "DOC-41"},
"reasoning": "Validate claim form — surfaces recent_policy_cluster from claim_date metadata.",
})
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": "DOC-42"},
"reasoning": "Validate garage estimate — exposes FastRepair Hub Whitefield (shared shop).",
})
# 2. Query historical data — confirms the policy purchase cluster context.
actions.append({
"action_type": "query_historical_data",
"parameters": {},
"reasoning": "Pull policy history — corroborates 24-day policy age inside the cluster window.",
})
# 3. Query the two visible linked claims. After the 2nd query the env
# auto-records shared_emergency_contact + clustered_policy_broker
# (NEW-7 hooks) and surfaces the hidden CLM-DIST-604.
for cid in ("CLM-DIST-602", "CLM-DIST-603"):
actions.append({
"action_type": "query_linked_claim",
"parameters": {"claim_id": cid},
"reasoning": f"Query {cid} to expose the cross-claim contact/broker overlap.",
})
# 4. Query the now-surfaced 4th claim — strengthens the broker cluster
# and confirms the shared shop / contact pattern.
actions.append({
"action_type": "query_linked_claim",
"parameters": {"claim_id": "CLM-DIST-604"},
"reasoning": "Query the newly-surfaced fourth claim — confirms full ring scope.",
})
# 5. Flag four of five expected_signals with evidence containing the
# keywords required by app.tasks.get_evidence_keyword_hints
# ("distribution_shift_claim", ...).
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "shared_repair_shop_far",
"evidence": "All linked claims used the same repair shop FastRepair Hub Whitefield — geographic ring indicator.",
},
"reasoning": "Shared distant repair shop is a grounded geographic ring indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "shared_emergency_contact",
"evidence": "All queried claims share the same emergency contact phone +91-9000005555 — coordinated contact ring.",
},
"reasoning": "Shared emergency contact across 3 supposedly unrelated claims is a strong ring indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "recent_policy_cluster",
"evidence": "All four related policies were purchased within a 30 day cluster window before the incident — policy purchase cluster.",
},
"reasoning": "Tight policy purchase cluster is a temporal ring indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "clustered_policy_broker",
"evidence": "All queried claims share the same broker BRK-882 — policy broker cluster confirmed across 4 claims.",
},
"reasoning": "Same broker across 4 supposedly unrelated policies = coordinated issuance.",
})
# 6. Adversarial review before terminal action
actions.append({
"action_type": "convene_debate_panel",
"parameters": {},
"reasoning": "Cross-claim ring of 4 demands adversarial review before recommending investigation.",
})
# 7. Terminal: escalate_to_human MED. ground_truth_confidence=0.70 +
# 4 grounded signals → MED is the calibrated answer (LOW would
# underclaim given the strength of the evidence; HIGH would
# overclaim given the residual uncertainty about the full ring scope).
actions.append({
"action_type": "escalate_to_human",
"confidence": "MED",
"parameters": {"reason": "Ring of 4 linked claims with shared shop/broker/contact/policy cluster. Investigator should confirm full scope."},
"reasoning": "Strong multi-signal evidence; ring may extend beyond 4 claims, so MED not HIGH.",
})
return actions
def _strategy_coordinated_fraud(client: DebateFloorClient, obs: Dict) -> List[Dict]:
"""Coordinated ring — validate primary docs (records 3 signals), query 3 linked
claims (surfaces hidden CLM-GROUP-304, records clustered_policy_broker), flag
4 of 5 expected_signals with grounded evidence, then escalate_to_human MED.
Env discovery contract (see app/environment.py:600-636 and 361-417):
validate_document(DOC-21) → records shared_repair_shop_far
validate_document(DOC-22) → records near_identical_descriptions
validate_document(DOC-23) → records recent_policy_cluster
query_linked_claim(CLM-GROUP-302), then (CLM-GROUP-303) → CLM-GROUP-304 surfaces
query_linked_claim(CLM-GROUP-304) → records clustered_policy_broker
shared_emergency_contact has NO discovery path that auto-records the signal
(only a hint string is returned), so flagging it would trigger the
"raised before discovered" penalty (+0.08 penalty_total). We skip it.
CF-4 fix: read variant-specific distance, template_similarity and
days_since_purchase from doc metadata so flagged evidence cites the actual
per-variant numbers.
"""
observation_cf = obs.get("observation", obs)
docs_cf = observation_cf.get("documents", []) or []
distance_km = None
template_similarity = None
purchase_days = None
for doc in docs_cf:
meta = doc.get("metadata", {}) or {}
if distance_km is None and "distance_km" in meta:
distance_km = meta["distance_km"]
if template_similarity is None and "template_similarity" in meta:
template_similarity = meta["template_similarity"]
if purchase_days is None and "days_since_purchase" in meta:
purchase_days = meta["days_since_purchase"]
actions: List[Dict] = []
# 1. Validate the three primary documents (each reveals one expected signal)
for doc_id in ("DOC-21", "DOC-22", "DOC-23"):
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": doc_id},
"reasoning": f"Validate {doc_id} to surface the embedded ring indicator.",
})
# 2. Query two known linked claims (surfaces the hidden CLM-GROUP-304)
for cid in ("CLM-GROUP-302", "CLM-GROUP-303"):
actions.append({
"action_type": "query_linked_claim",
"parameters": {"claim_id": cid},
"reasoning": f"Query {cid} to expose cross-claim contact/broker overlap.",
})
# 3. Query the now-surfaced 4th claim — this records clustered_policy_broker
actions.append({
"action_type": "query_linked_claim",
"parameters": {"claim_id": "CLM-GROUP-304"},
"reasoning": "Query the newly-surfaced fourth claim — confirms shared broker BRK-441.",
})
# 4. Flag four of five expected_signals with evidence containing required keywords
# (keywords from app.tasks.get_evidence_keyword_hints("coordinated_fraud", ...))
distance_text = f"{distance_km} km" if distance_km is not None else "340 km"
sim_text = f"{template_similarity:.2f}" if isinstance(template_similarity, (int, float)) else "0.93"
if isinstance(purchase_days, list) and purchase_days:
cluster_text = (
f"All four related policies were purchased within a 30 day cluster "
f"window before the incident (days since purchase: {purchase_days})."
)
else:
cluster_text = (
"All four related policies were purchased within a 30 day cluster window before the incident."
)
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "shared_repair_shop_far",
"evidence": f"Repair shop RapidFix Motors in Kota is {distance_text} from incident site — implausible distance.",
},
"reasoning": "Shared distant repair shop is a geographic ring indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "near_identical_descriptions",
"evidence": f"All linked claims use a near-identical narrative description template (similarity ~{sim_text}).",
},
"reasoning": "Identical narrative templates indicate copy-pasted fraud.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "recent_policy_cluster",
"evidence": cluster_text,
},
"reasoning": "Tight policy purchase cluster is a temporal ring indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "clustered_policy_broker",
"evidence": "All queried claims share the same broker BRK-441 — policy broker cluster confirmed.",
},
"reasoning": "Same broker across 4 supposedly unrelated policies = coordinated issuance.",
})
# 5. Adversarial review before terminal action
actions.append({
"action_type": "convene_debate_panel",
"parameters": {},
"reasoning": "Cross-claim ring of 4 demands adversarial review before recommending investigation.",
})
# 6. Terminal: escalate_to_human MED. Env normalises to request_investigation
# (allowed_final_decisions=['request_investigation']) and the calibration
# grader compares the raw escalate_to_human against ground truth
# escalate_to_human (see app/environment.py:34-41, 441-446).
actions.append({
"action_type": "escalate_to_human",
"confidence": "MED",
"parameters": {"reason": "Ring of 4 linked claims with shared shop/broker/policy cluster. Investigator should confirm full scope."},
"reasoning": "Strong evidence but ring may extend beyond 4 claims — MED is the calibrated answer.",
})
return actions
def _strategy_identity_fraud(client: DebateFloorClient, obs: Dict) -> List[Dict]:
"""Identity fraud — validate documents (records 2 signals), compare DOC-31 vs
DOC-34 (records dob_inconsistency), lookup_policy_history (records
recent_policy_purchase since policy_age_days=5 ≤ 30), flag all 4
expected_signals with grounded evidence, then deny_claim MED.
Env discovery contract (see app/environment.py:228-264, 600-636, app/tasks.py:680-683):
validate_document(DOC-31) → records identity_mismatch
validate_document(DOC-32) → records hospital_no_record
compare_documents(DOC-31, DOC-34) → records dob_inconsistency
lookup_policy_history → records recent_policy_purchase (policy_age_days=5)
CF-4 fix: pull per-variant `days_to_claim` from doc metadata so the
recent_policy_purchase evidence reflects the actual variant value
(5/7/3/8/6 days across the 5 variants).
"""
observation_id = obs.get("observation", obs)
docs_id = observation_id.get("documents", []) or []
actions: List[Dict] = []
# 1. Validate the two documents whose signals are auto-recorded
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": "DOC-31"},
"reasoning": "Validate primary claim form — exposes ID/registry mismatch.",
})
actions.append({
"action_type": "validate_document",
"parameters": {"doc_id": "DOC-32"},
"reasoning": "Validate hospital record — confirms no patient match.",
})
# 2. Compare DOC-31 vs DOC-34 — env's COMPARE_DOCUMENT_SIGNALS records dob_inconsistency
actions.append({
"action_type": "compare_documents",
"parameters": {"doc_id_a": "DOC-31", "doc_id_b": "DOC-34"},
"reasoning": "Compare claim form vs ID proof — reveals DOB inconsistency.",
})
# 3. Policy history lookup — records recent_policy_purchase (policy_age_days=5 ≤ 30)
actions.append({
"action_type": "lookup_policy_history",
"parameters": {},
"reasoning": "Pull policy history — exposes recent inception inside the 30 day exclusion window.",
})
# 4. Flag all four expected_signals with evidence containing required keywords
# (keywords from app.tasks.get_evidence_keyword_hints("identity_fraud", ...))
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "identity_mismatch",
"evidence": "National identity registry returns no record matching policy holder ID suffix 7821 — registry mismatch.",
},
"reasoning": "Identity registry mismatch is a grounded fraud indicator.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "hospital_no_record",
"evidence": "Hospital admission record has no patient name found for the claimant on file.",
},
"reasoning": "Hospital lookup confirms ghost claimant.",
})
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "dob_inconsistency",
"evidence": "Date of birth on submitted ID (1988-04-15) does not match policy DOB (1986-11-22) — inconsistency mismatch.",
},
"reasoning": "DOB drift across documents is a grounded identity-fraud signal.",
})
days_to_claim = None
for doc in docs_id:
meta = doc.get("metadata", {}) or {}
if "days_to_claim" in meta:
days_to_claim = meta["days_to_claim"]
break
days_text = f"{days_to_claim} days" if days_to_claim is not None else "5 days"
actions.append({
"action_type": "flag_fraud_signal",
"parameters": {
"flag_id": "recent_policy_purchase",
"evidence": (
f"Policy inception was only {days_text} before incident date — "
"well inside the 30 day exclusion window — recent policy purchase."
),
},
"reasoning": "Suspiciously recent policy purchase is a grounded indicator.",
})
# 5. Adversarial review before denial
actions.append({
"action_type": "convene_debate_panel",
"parameters": {},
"reasoning": "Four grounded signals warrant adversarial review before denial.",
})
# 6. Terminal: deny_claim MED. Ground truth is deny_claim
# (see app/environment.py:34-41) and allowed_final_decisions
# includes deny_claim (app/tasks.py:488).
actions.append({
"action_type": "deny_claim",
"confidence": "MED",
"parameters": {"reason": "Identity registry mismatch, hospital no-record, DOB drift, and recent policy inside exclusion window — claim cannot stand."},
"reasoning": "Strong multi-signal evidence; ID forgery is rarely provable to 100%, so MED not HIGH.",
})
return actions
# Dispatch table mapping each task_id to the function that builds its scripted
# action list. The keys are the same five task ids declared in TASK_CONFIG;
# run_episode() looks the requested task up here.
STRATEGIES = {
"clean_claim": _strategy_clean_claim,
"contradictory_claim": _strategy_contradictory_claim,
"distribution_shift_claim": _strategy_distribution_shift_claim,
"coordinated_fraud": _strategy_coordinated_fraud,
"identity_fraud": _strategy_identity_fraud,
}
# ─────────────────────────────────────────────────────────────
# EPISODE RUNNER
# ─────────────────────────────────────────────────────────────
def run_episode(task_id: str, model: str, base_url: str, seed: int) -> Dict[str, Any]:
    """Run one scripted episode and print the mandatory protocol lines.

    Emits one ``[START]`` line, one ``[STEP]`` line per attempted action and
    one ``[END]`` line on stdout, in the format described in the module
    docstring.

    Args:
        task_id: Key into ``STRATEGIES`` selecting the scripted strategy.
        model: Model name, echoed in the ``[START]`` line only.
        base_url: Base URL of the environment server.
        seed: Seed forwarded to the environment's ``/reset`` endpoint.

    Returns:
        A dict with ``task_id``, ``success``, ``steps``, ``total_reward``,
        ``calibration_score`` and ``decision``; or an empty dict when no
        strategy exists for ``task_id``.

    Raises:
        Whatever ``DebateFloorClient.reset`` raises (e.g. HTTP errors); step
        failures are reported in the ``[STEP]`` line instead of propagating.
    """
    # Resolve the strategy first so an unknown task neither emits a dangling
    # [START] line (with no matching [END]) nor resets the environment.
    strategy_fn = STRATEGIES.get(task_id)
    if not strategy_fn:
        # Diagnostics go to stderr so stdout carries only protocol lines.
        print(f"[ERROR] No strategy for task '{task_id}'", file=sys.stderr)
        return {}

    client = DebateFloorClient(base_url)
    # Print mandatory [START] line
    print(f"[START] task={task_id} env=debatefloor model={model} confidence_required=true")

    # Reset environment and build the scripted actions from the initial observation
    reset_resp = client.reset(task_id=task_id, seed=seed)
    actions = strategy_fn(client, reset_resp)

    total_reward = 0.0
    calibration_score = None
    step_num = 0
    last_done = False
    final_decision_correct = "none"

    for action in actions:
        if last_done:
            break
        step_num += 1
        confidence = action.get("confidence")
        try:
            step_resp = client.step(action)
        except Exception as e:
            # Best-effort: report the failure in the protocol line and move on
            # to the next scripted action.
            print(f"[STEP] step={step_num} action={action['action_type']} reward=0.0 confidence={confidence or 'null'} done=False error={e}")
            continue

        # JSON nulls are treated like missing fields so formatting and the
        # nested lookups below cannot fail on them.
        reward = step_resp.get("reward") or 0.0
        done = bool(step_resp.get("done", False))
        observation = step_resp.get("observation") or {}
        metadata = observation.get("metadata") or {}
        error = metadata.get("last_action_error")
        last_done = done

        # Extract calibration score on terminal actions
        if done and metadata.get("calibration_score") is not None:
            calibration_score = metadata["calibration_score"]

        # NOTE(review): this keeps the most recent step's reward rather than a
        # running sum — presumably the env reports a cumulative/final score;
        # confirm against the environment before changing.
        total_reward = reward

        # Print mandatory [STEP] line
        print(
            f"[STEP] step={step_num} action={action['action_type']} "
            f"reward={reward:.2f} confidence={confidence or 'null'} "
            f"done={done} error={error}"
        )

    # A non-negative calibration score counts as a correct decision.
    if calibration_score is not None:
        final_decision_correct = "correct" if calibration_score >= 0.0 else "wrong"
    success = last_done and (calibration_score is not None) and (calibration_score >= 0.0)

    # Print mandatory [END] line
    print(
        f"[END] success={success} steps={step_num} total_reward={total_reward:.2f} "
        f"calibration_score={calibration_score if calibration_score is not None else 'N/A'} "
        f"decision={final_decision_correct}"
    )
    return {
        "task_id": task_id,
        "success": success,
        "steps": step_num,
        "total_reward": total_reward,
        "calibration_score": calibration_score,
        "decision": final_decision_correct,
    }
# ─────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────
def main():
    """CLI entry point: check server health, run the selected task(s), print a summary.

    Exits with status 1 (message on stderr) when the health endpoint cannot
    be reached or does not report ``status == "healthy"``.
    """
    parser = argparse.ArgumentParser(description="DebateFloor baseline agent")
    parser.add_argument("--task", choices=ALL_TASKS + ["all"], default="contradictory_claim")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--all-tasks", action="store_true")
    args = parser.parse_args()

    # Verify server is up
    client = DebateFloorClient(args.base_url)
    try:
        health = client.health()
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would silently skip the health validation.
        if health.get("status") != "healthy":
            raise RuntimeError(f"unexpected health response: {health!r}")
    except Exception as e:
        print(f"[ERROR] Server not reachable at {args.base_url}: {e}", file=sys.stderr)
        sys.exit(1)

    tasks_to_run = ALL_TASKS if (args.all_tasks or args.task == "all") else [args.task]
    results = []
    for task_id in tasks_to_run:
        result = run_episode(task_id, args.model, args.base_url, args.seed)
        results.append(result)
        if len(tasks_to_run) > 1:
            print()  # blank line between tasks

    if len(results) > 1:
        print("\n-- Summary --")
        for r in results:
            if not r:
                # run_episode returns an empty dict when a task has no
                # strategy; there is nothing to summarise for it.
                continue
            cs = r.get("calibration_score")
            print(
                f" {r['task_id']}: reward={r['total_reward']:.2f} "
                f"calibration={cs if cs is not None else 'N/A'} "
                f"decision={r['decision']}"
            )


if __name__ == "__main__":
    main()