MridulNegi2005 committed on
Commit
3f2a3ab
·
1 Parent(s): a5c1817

Major env overhaul: opponent negotiates naturally, gentler time penalty, relative aggression, simplified agent

Browse files
__pycache__/env_wrapper.cpython-312.pyc CHANGED
Binary files a/__pycache__/env_wrapper.cpython-312.pyc and b/__pycache__/env_wrapper.cpython-312.pyc differ
 
env_wrapper.py CHANGED
@@ -81,10 +81,15 @@ class Opponent:
81
  return "REJECT", 0
82
 
83
  # ── Acceptance Check ──
84
- if self.opponent_role == "seller" and agent_offer >= self.opponent_value:
85
- self.history.append({"round": round_num, "action": "ACCEPT", "price": agent_offer})
86
- return "ACCEPT", agent_offer
87
- if self.opponent_role == "buyer" and agent_offer <= self.opponent_value:
 
 
 
 
 
88
  self.history.append({"round": round_num, "action": "ACCEPT", "price": agent_offer})
89
  return "ACCEPT", agent_offer
90
 
@@ -192,8 +197,9 @@ class EnvWrapper:
192
  else:
193
  profit = self.agent_value - deal_price
194
 
195
- # Softer time decay: sqrt penalizes less harshly in early rounds
196
- time_factor = 1.0 - (self.round / self.max_rounds) ** 0.5
 
197
  base_reward = profit * time_factor
198
 
199
  # Penalty for bad deals (agent accepts a losing deal)
@@ -268,7 +274,10 @@ class EnvWrapper:
268
  action_str = f"OFFER {action_price}"
269
 
270
  # ── CUMULATIVE AGGRESSION PENALTY ──
271
- if abs(action_price - self.opponent_value) > 150:
 
 
 
272
  self.cumulative_aggression_penalty += 2.0
273
 
274
  # Record this step in history
 
81
  return "REJECT", 0
82
 
83
  # ── Acceptance Check ──
84
+ # Opponent negotiates for a minimum number of rounds before accepting.
85
+ # Greedy opponents hold out longer; impatient ones settle sooner.
86
+ min_round_to_accept = max(2, self.patience // 3)
87
+
88
+ offer_acceptable = (
89
+ (self.opponent_role == "seller" and agent_offer >= self.opponent_value) or
90
+ (self.opponent_role == "buyer" and agent_offer <= self.opponent_value)
91
+ )
92
+ if offer_acceptable and round_num >= min_round_to_accept:
93
  self.history.append({"round": round_num, "action": "ACCEPT", "price": agent_offer})
94
  return "ACCEPT", agent_offer
95
 
 
197
  else:
198
  profit = self.agent_value - deal_price
199
 
200
+ # Gentle time decay: linear, max 50% loss even if all rounds used.
201
+ # This rewards fast deals but doesn't destroy multi-round negotiation.
202
+ time_factor = 1.0 - 0.5 * (self.round / self.max_rounds)
203
  base_reward = profit * time_factor
204
 
205
  # Penalty for bad deals (agent accepts a losing deal)
 
274
  action_str = f"OFFER {action_price}"
275
 
276
  # ── CUMULATIVE AGGRESSION PENALTY ──
277
+ # Scale threshold to ZOPA width so narrow-ZOPA tasks aren't unfairly punished
278
+ zopa = abs(self.agent_value - self.opponent_value)
279
+ aggression_threshold = max(100, int(zopa * 1.25))
280
+ if abs(action_price - self.opponent_value) > aggression_threshold:
281
  self.cumulative_aggression_penalty += 2.0
282
 
283
  # Record this step in history
inference.py CHANGED
@@ -72,7 +72,7 @@ def run_task(client, model_name: str, task_config):
72
 
73
  target_goal = "buy for as low as possible (below your maximum value)" if obs.role == "buyer" else "sell for as high as possible (above your minimum value)"
74
 
75
- prompt = f"""You are an expert negotiator acting as a {obs.role}. Your objective is to {target_goal} and maximize your profit through strategic multi-round bargaining.
76
 
77
  CURRENT STATE:
78
  * Your PRIVATE Valuation: {obs.agent_value} (your absolute limit β€” NEVER go past this)
@@ -81,28 +81,21 @@ CURRENT STATE:
81
  * Opponent's last action: {obs.last_opponent_action}
82
  * Opponent's last offer: {obs.last_opponent_offer}
83
 
84
- {history_text}YOUR NEGOTIATION PLAYBOOK:
 
 
 
 
 
85
 
86
- ROUND-BY-ROUND STRATEGY (you are a {obs.role}):
87
- {("- Round 1: You only have " + str(obs.max_rounds) + " rounds! Start at about 60-65% of your own valuation (" + str(obs.agent_value) + ") as your first offer. Then increase quickly by 100+ each round.") if obs.max_rounds <= 8 else ("- Round 1: Start AGGRESSIVE. Offer around 30-35% of the opponent's opening price. This anchors the negotiation in your favor." if obs.role == "buyer" else "- Round 1: Start AGGRESSIVE. Offer around 2-3x your minimum value. This anchors the negotiation in your favor.")}
88
- - Round 2-3: Concede moderately. {"Increase" if obs.role == "buyer" else "Decrease"} your offer to find their breaking point.
89
- - Round 3-4: If the opponent's counter-offer is profitable for you ({"below" if obs.role == "buyer" else "above"} your valuation), ACCEPT it. Otherwise make one final offer near the midpoint.
90
- - Round 5+: You are running out of time. ACCEPT any profitable deal immediately.
91
 
92
- ABSOLUTE LIMIT: {"Your offer must NEVER exceed " + str(obs.agent_value) + ". Any offer above " + str(obs.agent_value) + " loses you money!" if obs.role == "buyer" else "Your offer must NEVER go below " + str(obs.agent_value) + ". Any offer below " + str(obs.agent_value) + " loses you money!"}
 
 
 
93
 
94
- SCORING RULES:
95
- 1. PROFIT MATTERS MOST: Your score = (your profit) Γ— (time bonus). A great deal on round 3 beats a mediocre deal on round 1.
96
- 2. TIME BONUS: Decreases each round. Don't drag past round 5.
97
- 3. AGGRESSION PENALTY: Offers extremely far from reasonable (e.g., offering 100 when market is 500+) are penalized. Stay within a plausible range.
98
- 4. NEVER REJECT β€” a bad deal is almost always better than no deal (rejection = -50 penalty).
99
-
100
- Choose exactly ONE action:
101
- * OFFER <price> β€” counter-offer ({"must be below " + str(obs.agent_value) if obs.role == "buyer" else "must be above " + str(obs.agent_value)})
102
- * ACCEPT β€” accept if the opponent's offer gives you good profit
103
- * REJECT β€” walk away (almost never do this)
104
-
105
- Respond with ONLY your action. Example: OFFER 350"""
106
 
107
  action_str = "REJECT"
108
  action_price = 0
@@ -151,50 +144,33 @@ Respond with ONLY your action. Example: OFFER 350"""
151
  action_str = "REJECT"
152
  action_price = 0
153
 
154
- # ── ACCEPT guard: never accept a losing deal ──
 
155
  if action_str == "ACCEPT":
156
  opp_offer = obs.last_opponent_offer
157
  if obs.role == "buyer" and opp_offer > obs.agent_value:
158
- # Opponent wants more than our max β€” counter instead
159
  action_str = "OFFER"
160
- action_price = last_agent_offer + 80 if last_agent_offer else int(obs.agent_value * 0.6)
161
  elif obs.role == "seller" and opp_offer < obs.agent_value:
162
  action_str = "OFFER"
163
- action_price = last_agent_offer - 80 if last_agent_offer else int(obs.agent_value * 1.4)
164
 
165
- # ── Smart offer clamping ──
166
  if action_str.startswith("OFFER") and action_price > 0:
167
- # Hard limit: never cross own valuation
168
  if obs.role == "buyer":
169
  action_price = min(action_price, obs.agent_value - 10)
170
  else:
171
  action_price = max(action_price, obs.agent_value + 10)
172
 
173
- # Adaptive concession cap: short games need bigger steps
174
- max_step = max(80, 1200 // env.max_rounds)
175
  if last_agent_offer is not None:
176
  if obs.role == "buyer":
177
- action_price = min(action_price, last_agent_offer + max_step)
178
  else:
179
- action_price = max(action_price, last_agent_offer - max_step)
180
-
181
- # Target price: aim for ~40% of the gap from agent's value
182
- # This is where the best profit-vs-time tradeoff lives
183
- gap = abs(obs.agent_value - obs.current_offer)
184
- if obs.role == "buyer":
185
- target = obs.agent_value - int(gap * 0.4) # aim to buy well below value
186
- # Don't let round 2+ offers go above target unless desperate (round 4+)
187
- if step_n <= 3 and last_agent_offer is not None:
188
- action_price = min(action_price, max(target, last_agent_offer + 50))
189
- else:
190
- target = obs.agent_value + int(gap * 0.4)
191
- if step_n <= 3 and last_agent_offer is not None:
192
- action_price = max(action_price, min(target, last_agent_offer - 50))
193
 
194
  action_str = f"OFFER {action_price}"
195
  last_agent_offer = action_price
196
- elif action_str.startswith("OFFER"):
197
- last_agent_offer = action_price
198
 
199
  # ── Step the environment ──
200
  obs, reward, done, info = env.step(action_str, action_price)
 
72
 
73
  target_goal = "buy for as low as possible (below your maximum value)" if obs.role == "buyer" else "sell for as high as possible (above your minimum value)"
74
 
75
+ prompt = f"""You are an expert negotiator acting as a {obs.role}. Your objective is to {target_goal} and maximize your profit.
76
 
77
  CURRENT STATE:
78
  * Your PRIVATE Valuation: {obs.agent_value} (your absolute limit β€” NEVER go past this)
 
81
  * Opponent's last action: {obs.last_opponent_action}
82
  * Opponent's last offer: {obs.last_opponent_offer}
83
 
84
+ {history_text}STRATEGY:
85
+ - Start your first offer at about 40-50% of the opening price. {"As a buyer with valuation " + str(obs.agent_value) + ", aim to pay as LITTLE as possible β€” profit = valuation minus price." if obs.role == "buyer" else "As a seller with valuation " + str(obs.agent_value) + ", aim to sell as HIGH as possible β€” profit = price minus valuation."}
86
+ - Concede slowly each round (50-80 per round), watching the opponent move toward you.
87
+ - If the opponent's counter is {"below" if obs.role == "buyer" else "above"} {obs.agent_value}, ACCEPT it β€” that's guaranteed profit!
88
+ - Close within 3-5 rounds for best time bonus.
89
+ - NEVER REJECT β€” rejection = -50 penalty.
90
 
91
+ HARD RULE: {"Your offer must be BELOW " + str(obs.agent_value) + ". Offering above it loses you money." if obs.role == "buyer" else "Your offer must be ABOVE " + str(obs.agent_value) + ". Offering below it loses you money."}
 
 
 
 
92
 
93
+ Choose ONE action:
94
+ * OFFER <price>
95
+ * ACCEPT
96
+ * REJECT
97
 
98
+ Respond with ONLY your action. Example: OFFER 450"""
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  action_str = "REJECT"
101
  action_price = 0
 
144
  action_str = "REJECT"
145
  action_price = 0
146
 
147
+ # ── Safety guardrails ──
148
+ # ACCEPT guard: never accept a deal worse than our valuation
149
  if action_str == "ACCEPT":
150
  opp_offer = obs.last_opponent_offer
151
  if obs.role == "buyer" and opp_offer > obs.agent_value:
 
152
  action_str = "OFFER"
153
+ action_price = last_agent_offer + 50 if last_agent_offer else int(obs.agent_value * 0.6)
154
  elif obs.role == "seller" and opp_offer < obs.agent_value:
155
  action_str = "OFFER"
156
+ action_price = last_agent_offer - 50 if last_agent_offer else int(obs.agent_value * 1.4)
157
 
158
+ # Valuation clamp: never offer past our own limit
159
  if action_str.startswith("OFFER") and action_price > 0:
 
160
  if obs.role == "buyer":
161
  action_price = min(action_price, obs.agent_value - 10)
162
  else:
163
  action_price = max(action_price, obs.agent_value + 10)
164
 
165
+ # Concession cap: max 120 per round to prevent panic jumps
 
166
  if last_agent_offer is not None:
167
  if obs.role == "buyer":
168
+ action_price = min(action_price, last_agent_offer + 120)
169
  else:
170
+ action_price = max(action_price, last_agent_offer - 120)
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  action_str = f"OFFER {action_price}"
173
  last_agent_offer = action_price
 
 
174
 
175
  # ── Step the environment ──
176
  obs, reward, done, info = env.step(action_str, action_price)