Spaces:
Sleeping
Sleeping
Commit ·
3f2a3ab
1
Parent(s): a5c1817
Major env overhaul: opponent negotiates naturally, gentler time penalty, relative aggression, simplified agent
Browse files- __pycache__/env_wrapper.cpython-312.pyc +0 -0
- env_wrapper.py +16 -7
- inference.py +21 -45
__pycache__/env_wrapper.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/env_wrapper.cpython-312.pyc and b/__pycache__/env_wrapper.cpython-312.pyc differ
|
|
|
env_wrapper.py
CHANGED
|
@@ -81,10 +81,15 @@ class Opponent:
|
|
| 81 |
return "REJECT", 0
|
| 82 |
|
| 83 |
# ── Acceptance Check ──
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
self.history.append({"round": round_num, "action": "ACCEPT", "price": agent_offer})
|
| 89 |
return "ACCEPT", agent_offer
|
| 90 |
|
|
@@ -192,8 +197,9 @@ class EnvWrapper:
|
|
| 192 |
else:
|
| 193 |
profit = self.agent_value - deal_price
|
| 194 |
|
| 195 |
-
#
|
| 196 |
-
|
|
|
|
| 197 |
base_reward = profit * time_factor
|
| 198 |
|
| 199 |
# Penalty for bad deals (agent accepts a losing deal)
|
|
@@ -268,7 +274,10 @@ class EnvWrapper:
|
|
| 268 |
action_str = f"OFFER {action_price}"
|
| 269 |
|
| 270 |
# ── CUMULATIVE AGGRESSION PENALTY ──
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
| 272 |
self.cumulative_aggression_penalty += 2.0
|
| 273 |
|
| 274 |
# Record this step in history
|
|
|
|
| 81 |
return "REJECT", 0
|
| 82 |
|
| 83 |
# ── Acceptance Check ──
|
| 84 |
+
# Opponent negotiates for a minimum number of rounds before accepting.
|
| 85 |
+
# Greedy opponents hold out longer; impatient ones settle sooner.
|
| 86 |
+
min_round_to_accept = max(2, self.patience // 3)
|
| 87 |
+
|
| 88 |
+
offer_acceptable = (
|
| 89 |
+
(self.opponent_role == "seller" and agent_offer >= self.opponent_value) or
|
| 90 |
+
(self.opponent_role == "buyer" and agent_offer <= self.opponent_value)
|
| 91 |
+
)
|
| 92 |
+
if offer_acceptable and round_num >= min_round_to_accept:
|
| 93 |
self.history.append({"round": round_num, "action": "ACCEPT", "price": agent_offer})
|
| 94 |
return "ACCEPT", agent_offer
|
| 95 |
|
|
|
|
| 197 |
else:
|
| 198 |
profit = self.agent_value - deal_price
|
| 199 |
|
| 200 |
+
# Gentle time decay: linear, max 50% loss even if all rounds used.
|
| 201 |
+
# This rewards fast deals but doesn't destroy multi-round negotiation.
|
| 202 |
+
time_factor = 1.0 - 0.5 * (self.round / self.max_rounds)
|
| 203 |
base_reward = profit * time_factor
|
| 204 |
|
| 205 |
# Penalty for bad deals (agent accepts a losing deal)
|
|
|
|
| 274 |
action_str = f"OFFER {action_price}"
|
| 275 |
|
| 276 |
# ── CUMULATIVE AGGRESSION PENALTY ──
|
| 277 |
+
# Scale threshold to ZOPA width so narrow-ZOPA tasks aren't unfairly punished
|
| 278 |
+
zopa = abs(self.agent_value - self.opponent_value)
|
| 279 |
+
aggression_threshold = max(100, int(zopa * 1.25))
|
| 280 |
+
if abs(action_price - self.opponent_value) > aggression_threshold:
|
| 281 |
self.cumulative_aggression_penalty += 2.0
|
| 282 |
|
| 283 |
# Record this step in history
|
inference.py
CHANGED
|
@@ -72,7 +72,7 @@ def run_task(client, model_name: str, task_config):
|
|
| 72 |
|
| 73 |
target_goal = "buy for as low as possible (below your maximum value)" if obs.role == "buyer" else "sell for as high as possible (above your minimum value)"
|
| 74 |
|
| 75 |
-
prompt = f"""You are an expert negotiator acting as a {obs.role}. Your objective is to {target_goal} and maximize your profit
|
| 76 |
|
| 77 |
CURRENT STATE:
|
| 78 |
* Your PRIVATE Valuation: {obs.agent_value} (your absolute limit — NEVER go past this)
|
|
@@ -81,28 +81,21 @@ CURRENT STATE:
|
|
| 81 |
* Opponent's last action: {obs.last_opponent_action}
|
| 82 |
* Opponent's last offer: {obs.last_opponent_offer}
|
| 83 |
|
| 84 |
-
{history_text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
{("- Round 1: You only have " + str(obs.max_rounds) + " rounds! Start at about 60-65% of your own valuation (" + str(obs.agent_value) + ") as your first offer. Then increase quickly by 100+ each round.") if obs.max_rounds <= 8 else ("- Round 1: Start AGGRESSIVE. Offer around 30-35% of the opponent's opening price. This anchors the negotiation in your favor." if obs.role == "buyer" else "- Round 1: Start AGGRESSIVE. Offer around 2-3x your minimum value. This anchors the negotiation in your favor.")}
|
| 88 |
-
- Round 2-3: Concede moderately. {"Increase" if obs.role == "buyer" else "Decrease"} your offer to find their breaking point.
|
| 89 |
-
- Round 3-4: If the opponent's counter-offer is profitable for you ({"below" if obs.role == "buyer" else "above"} your valuation), ACCEPT it. Otherwise make one final offer near the midpoint.
|
| 90 |
-
- Round 5+: You are running out of time. ACCEPT any profitable deal immediately.
|
| 91 |
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
1. PROFIT MATTERS MOST: Your score = (your profit) Γ (time bonus). A great deal on round 3 beats a mediocre deal on round 1.
|
| 96 |
-
2. TIME BONUS: Decreases each round. Don't drag past round 5.
|
| 97 |
-
3. AGGRESSION PENALTY: Offers extremely far from reasonable (e.g., offering 100 when market is 500+) are penalized. Stay within a plausible range.
|
| 98 |
-
4. NEVER REJECT β a bad deal is almost always better than no deal (rejection = -50 penalty).
|
| 99 |
-
|
| 100 |
-
Choose exactly ONE action:
|
| 101 |
-
* OFFER <price> β counter-offer ({"must be below " + str(obs.agent_value) if obs.role == "buyer" else "must be above " + str(obs.agent_value)})
|
| 102 |
-
* ACCEPT β accept if the opponent's offer gives you good profit
|
| 103 |
-
* REJECT β walk away (almost never do this)
|
| 104 |
-
|
| 105 |
-
Respond with ONLY your action. Example: OFFER 350"""
|
| 106 |
|
| 107 |
action_str = "REJECT"
|
| 108 |
action_price = 0
|
|
@@ -151,50 +144,33 @@ Respond with ONLY your action. Example: OFFER 350"""
|
|
| 151 |
action_str = "REJECT"
|
| 152 |
action_price = 0
|
| 153 |
|
| 154 |
-
# ββ
|
|
|
|
| 155 |
if action_str == "ACCEPT":
|
| 156 |
opp_offer = obs.last_opponent_offer
|
| 157 |
if obs.role == "buyer" and opp_offer > obs.agent_value:
|
| 158 |
-
# Opponent wants more than our max β counter instead
|
| 159 |
action_str = "OFFER"
|
| 160 |
-
action_price = last_agent_offer +
|
| 161 |
elif obs.role == "seller" and opp_offer < obs.agent_value:
|
| 162 |
action_str = "OFFER"
|
| 163 |
-
action_price = last_agent_offer -
|
| 164 |
|
| 165 |
-
#
|
| 166 |
if action_str.startswith("OFFER") and action_price > 0:
|
| 167 |
-
# Hard limit: never cross own valuation
|
| 168 |
if obs.role == "buyer":
|
| 169 |
action_price = min(action_price, obs.agent_value - 10)
|
| 170 |
else:
|
| 171 |
action_price = max(action_price, obs.agent_value + 10)
|
| 172 |
|
| 173 |
-
#
|
| 174 |
-
max_step = max(80, 1200 // env.max_rounds)
|
| 175 |
if last_agent_offer is not None:
|
| 176 |
if obs.role == "buyer":
|
| 177 |
-
action_price = min(action_price, last_agent_offer +
|
| 178 |
else:
|
| 179 |
-
action_price = max(action_price, last_agent_offer -
|
| 180 |
-
|
| 181 |
-
# Target price: aim for ~40% of the gap from agent's value
|
| 182 |
-
# This is where the best profit-vs-time tradeoff lives
|
| 183 |
-
gap = abs(obs.agent_value - obs.current_offer)
|
| 184 |
-
if obs.role == "buyer":
|
| 185 |
-
target = obs.agent_value - int(gap * 0.4) # aim to buy well below value
|
| 186 |
-
# Don't let round 2+ offers go above target unless desperate (round 4+)
|
| 187 |
-
if step_n <= 3 and last_agent_offer is not None:
|
| 188 |
-
action_price = min(action_price, max(target, last_agent_offer + 50))
|
| 189 |
-
else:
|
| 190 |
-
target = obs.agent_value + int(gap * 0.4)
|
| 191 |
-
if step_n <= 3 and last_agent_offer is not None:
|
| 192 |
-
action_price = max(action_price, min(target, last_agent_offer - 50))
|
| 193 |
|
| 194 |
action_str = f"OFFER {action_price}"
|
| 195 |
last_agent_offer = action_price
|
| 196 |
-
elif action_str.startswith("OFFER"):
|
| 197 |
-
last_agent_offer = action_price
|
| 198 |
|
| 199 |
# ── Step the environment ──
|
| 200 |
obs, reward, done, info = env.step(action_str, action_price)
|
|
|
|
| 72 |
|
| 73 |
target_goal = "buy for as low as possible (below your maximum value)" if obs.role == "buyer" else "sell for as high as possible (above your minimum value)"
|
| 74 |
|
| 75 |
+
prompt = f"""You are an expert negotiator acting as a {obs.role}. Your objective is to {target_goal} and maximize your profit.
|
| 76 |
|
| 77 |
CURRENT STATE:
|
| 78 |
* Your PRIVATE Valuation: {obs.agent_value} (your absolute limit — NEVER go past this)
|
|
|
|
| 81 |
* Opponent's last action: {obs.last_opponent_action}
|
| 82 |
* Opponent's last offer: {obs.last_opponent_offer}
|
| 83 |
|
| 84 |
+
{history_text}STRATEGY:
|
| 85 |
+
- Start your first offer at about 40-50% of the opening price. {"As a buyer with valuation " + str(obs.agent_value) + ", aim to pay as LITTLE as possible — profit = valuation minus price." if obs.role == "buyer" else "As a seller with valuation " + str(obs.agent_value) + ", aim to sell as HIGH as possible — profit = price minus valuation."}
|
| 86 |
+
- Concede slowly each round (50-80 per round), watching the opponent move toward you.
|
| 87 |
+
- If the opponent's counter is {"below" if obs.role == "buyer" else "above"} {obs.agent_value}, ACCEPT it — that's guaranteed profit!
|
| 88 |
+
- Close within 3-5 rounds for best time bonus.
|
| 89 |
+
- NEVER REJECT — rejection = -50 penalty.
|
| 90 |
|
| 91 |
+
HARD RULE: {"Your offer must be BELOW " + str(obs.agent_value) + ". Offering above it loses you money." if obs.role == "buyer" else "Your offer must be ABOVE " + str(obs.agent_value) + ". Offering below it loses you money."}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
Choose ONE action:
|
| 94 |
+
* OFFER <price>
|
| 95 |
+
* ACCEPT
|
| 96 |
+
* REJECT
|
| 97 |
|
| 98 |
+
Respond with ONLY your action. Example: OFFER 450"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
action_str = "REJECT"
|
| 101 |
action_price = 0
|
|
|
|
| 144 |
action_str = "REJECT"
|
| 145 |
action_price = 0
|
| 146 |
|
| 147 |
+
# ── Safety guardrails ──
|
| 148 |
+
# ACCEPT guard: never accept a deal worse than our valuation
|
| 149 |
if action_str == "ACCEPT":
|
| 150 |
opp_offer = obs.last_opponent_offer
|
| 151 |
if obs.role == "buyer" and opp_offer > obs.agent_value:
|
|
|
|
| 152 |
action_str = "OFFER"
|
| 153 |
+
action_price = last_agent_offer + 50 if last_agent_offer else int(obs.agent_value * 0.6)
|
| 154 |
elif obs.role == "seller" and opp_offer < obs.agent_value:
|
| 155 |
action_str = "OFFER"
|
| 156 |
+
action_price = last_agent_offer - 50 if last_agent_offer else int(obs.agent_value * 1.4)
|
| 157 |
|
| 158 |
+
# Valuation clamp: never offer past our own limit
|
| 159 |
if action_str.startswith("OFFER") and action_price > 0:
|
|
|
|
| 160 |
if obs.role == "buyer":
|
| 161 |
action_price = min(action_price, obs.agent_value - 10)
|
| 162 |
else:
|
| 163 |
action_price = max(action_price, obs.agent_value + 10)
|
| 164 |
|
| 165 |
+
# Concession cap: max 120 per round to prevent panic jumps
|
|
|
|
| 166 |
if last_agent_offer is not None:
|
| 167 |
if obs.role == "buyer":
|
| 168 |
+
action_price = min(action_price, last_agent_offer + 120)
|
| 169 |
else:
|
| 170 |
+
action_price = max(action_price, last_agent_offer - 120)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
action_str = f"OFFER {action_price}"
|
| 173 |
last_agent_offer = action_price
|
|
|
|
|
|
|
| 174 |
|
| 175 |
# ── Step the environment ──
|
| 176 |
obs, reward, done, info = env.step(action_str, action_price)
|