# ghostexec/server/reward.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Phase 4 reward: weighted (0.35 / 0.35 / 0.30) with potential-style deltas, critical-queue
shaping, full sub-scores even on invalid steps (+ explicit invalid penalty), and mild output
scaling.
"""
from __future__ import annotations
import math
from datetime import datetime, timedelta, timezone
from typing import Any
try:
from ..models import GhostexecAction, RewardBreakdown, WorldState
except ImportError:  # fall back to a flat import when run outside the package
from models import GhostexecAction, RewardBreakdown, WorldState
W_CONFLICT = 0.35
W_REL = 0.35
W_TASK = 0.30
# Raw conflict units (pre-weight) are clamped to keep invalid / idle steps from exploding.
CONFLICT_RAW_CAP: float = 6.0
# Scales the weighted sum of the three channels (weights stay fixed per hackathon rules).
WEIGHTED_OUTPUT_SCALE: float = 0.48
# Tone-misfit penalties are kept small relative to outcome terms (under ~20% of a strong +2.0 conflict step after weighting).
TONE_PENALTY_CASUAL_ANGRY_BOARD: float = 0.35
TONE_PENALTY_FORMAL_PERSONAL: float = 0.08
_RESOLVE_MICRO_BONUS: float = 0.12
_CRITICAL_PER_EMAIL_BONUS: float = 0.22
_RESCHEDULE_VALID_MICRO_BONUS: float = 0.10
_SEND_MESSAGE_VALID_MICRO_BONUS: float = 0.08
_COMPLETE_TASK_VALID_MICRO_BONUS: float = 0.06
_DELEGATE_TASK_VALID_MICRO_BONUS: float = 0.10
_DO_NOTHING_STRICT_PENALTY: float = -0.15
_SYNERGY_CAP: float = 0.40
_TRADEOFF_CAP: float = 0.30
_POTENTIAL_CAP: float = 0.25
_SCAFFOLD_CAP: float = 0.35
_SHAPING_TO_BASE_BUDGET: float = 1.25
_QUALITY_CAP: float = 0.28
_REPLY_PRIORITY_MICRO_BONUS: dict[str, float] = {
"critical": 0.30,
"high": 0.15,
"normal": 0.04,
"low": 0.02,
}
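# Example: a valid reply to a critical email whose sender_relationship is "VIP"
# earns 0.30 * 2 = 0.60 in score_relationship(), since the VIP multiplier
# doubles the priority micro-bonus.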
_MOOD_RANK: dict[str, int] = {
"happy": 4,
"neutral": 3,
"annoyed": 2,
"angry": 1,
"furious": 0,
}
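# Higher rank = better mood. score_relationship() adds +3 (VIP) / +1 per contact
# whose mood improved and -4 (VIP) / -2 per contact whose mood worsened, where
# "VIP" means importance >= 4; a VIP sliding from neutral to angry costs -4.0.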
def _parse_dt(value: str) -> datetime:
if value.endswith("Z"):
return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
dt = datetime.fromisoformat(value)
if dt.tzinfo is None:
return dt.replace(tzinfo=timezone.utc)
return dt
def _meeting_end(m: Any) -> datetime:
start = _parse_dt(m.start)
return start + timedelta(minutes=m.duration_minutes)
def _overlap(a0: datetime, a1: datetime, b0: datetime, b1: datetime) -> bool:
return a0 < b1 and b0 < a1
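# _overlap uses strict '<' on both ends, so intervals are effectively half-open:
# back-to-back meetings (one ends exactly when the next starts) do not conflict.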
def meeting_conflicts(world: WorldState) -> list[dict[str, Any]]:
active = [m for m in world.meetings if not m.cancelled]
out: list[dict[str, Any]] = []
for i, a in enumerate(active):
a0, a1 = _parse_dt(a.start), _meeting_end(a)
for b in active[i + 1 :]:
b0, b1 = _parse_dt(b.start), _meeting_end(b)
if _overlap(a0, a1, b0, b1):
o0, o1 = max(a0, b0), min(a1, b1)
out.append(
{
"meeting_a": a.id,
"meeting_b": b.id,
"overlap_start": o0.isoformat(),
"overlap_end": o1.isoformat(),
}
)
return out
def _pair_set(rows: list[dict[str, Any]]) -> set[frozenset[str]]:
return {frozenset((r["meeting_a"], r["meeting_b"])) for r in rows}
def _attendee_moods_ok(world: WorldState, pair: frozenset[str]) -> bool:
names: set[str] = set()
for mid in pair:
m = next((x for x in world.meetings if x.id == mid), None)
if m:
names.update(m.attendees)
for n in names:
c = next((x for x in world.contacts if x.name == n), None)
if c is None:
continue
if c.mood not in ("happy", "neutral"):
return False
return True
def score_conflict_resolution(
before: WorldState,
after: WorldState,
action: GhostexecAction,
*,
action_ok: bool,
) -> float:
b = _pair_set(meeting_conflicts(before))
a = _pair_set(meeting_conflicts(after))
s = 0.0
    for pair in b - a:
        s += 2.0 + _RESOLVE_MICRO_BONUS
        if _attendee_moods_ok(after, pair):
s += 1.0
for _ in a - b:
s -= 3.0
if action_ok and action.action_type == "reschedule_meeting":
s += _RESCHEDULE_VALID_MICRO_BONUS
return s
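# Worked example for score_conflict_resolution (hypothetical step): resolving one
# conflict pair whose attendees all end up happy/neutral scores
# 2.0 + 0.12 + 1.0 = 3.12, plus 0.10 more if the action was a valid
# reschedule_meeting; creating a new conflict costs a flat -3.0 per pair.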
def critical_unreplied_count(world: WorldState) -> int:
return sum(1 for e in world.emails if e.priority == "critical" and not e.replied)
def score_critical_queue_bonus(before: WorldState, after: WorldState) -> float:
reduction = critical_unreplied_count(before) - critical_unreplied_count(after)
return _CRITICAL_PER_EMAIL_BONUS * max(0, reduction)
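# Example: clearing two critical unreplied emails in one step yields
# 2 * 0.22 = 0.44; regressions (reduction < 0) are floored at zero rather than
# penalized here.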
def _classify_tone(text: str) -> str:
t = text.lower()
if any(w in t for w in ("sorry", "apologize", "apologies", "my mistake")):
return "apologetic"
if any(w in t for w in ("dear ", "sincerely", "best regards", "respectfully", "cordially")):
return "formal"
if any(w in t for w in ("hey", "lol", "haha", "👋", "no worries", "cheers")):
return "casual"
if any(w in t for w in ("must", "immediately", "asap", "non-negotiable", "demand")):
return "assertive"
return "neutral"
def score_relationship(
before: WorldState,
after: WorldState,
action: GhostexecAction,
*,
action_ok: bool,
relationship_suppressed_for_email_to: frozenset[str] | None = None,
) -> float:
rel_sup = relationship_suppressed_for_email_to or frozenset()
s = 0.0
before_map = {c.name: c for c in before.contacts}
after_map = {c.name: c for c in after.contacts}
for name, ca in after_map.items():
cb = before_map.get(name)
if not cb:
continue
ra, rb = _MOOD_RANK[ca.mood], _MOOD_RANK[cb.mood]
vip = ca.importance >= 4
if ra > rb:
s += 3.0 if vip else 1.0
elif ra < rb:
s -= 4.0 if vip else 2.0
if action.action_type == "reply_email" and action.email_id:
em = next((e for e in before.emails if e.id == action.email_id), None)
if em and em.sender in rel_sup:
return 0.0
if em:
if action_ok and (action.message_body or "").strip():
pri = (em.priority or "").lower()
micro = _REPLY_PRIORITY_MICRO_BONUS.get(pri, 0.0)
if em.sender_relationship == "VIP":
micro *= 2.0
s += micro
tone = _classify_tone(action.message_body)
contact = next((c for c in before.contacts if c.name == em.sender), None)
if (
contact
and contact.relationship_type == "board_member"
and contact.mood in ("angry", "furious", "annoyed")
and tone == "casual"
):
s -= TONE_PENALTY_CASUAL_ANGRY_BOARD
if em.sender_relationship == "personal" and tone == "formal":
s -= TONE_PENALTY_FORMAL_PERSONAL
if action_ok and action.action_type == "send_message" and action.contact_name:
known_contact = any(c.name == action.contact_name for c in before.contacts)
if known_contact and (action.message_body or "").strip():
s += _SEND_MESSAGE_VALID_MICRO_BONUS
return s
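# Worked example for score_relationship (hypothetical step): a valid reply to a
# high-priority email from a non-VIP contact (importance below 4,
# sender_relationship not "VIP") whose mood rose from annoyed to neutral scores
# 1.0 + 0.15 = 1.15. Separately, a casual-toned reply to a board member who was
# annoyed/angry/furious before the step loses 0.35.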
def _overdue_tasks(world: WorldState) -> list[Any]:
now = _parse_dt(world.simulation_time)
out = []
for t in world.tasks:
if t.status == "done":
continue
if _parse_dt(t.deadline) < now:
out.append(t)
return out
def score_task_completion(
before: WorldState,
after: WorldState,
action: GhostexecAction,
*,
action_ok: bool,
) -> float:
s = 0.0
now = _parse_dt(after.simulation_time)
before_tasks = {t.id: t for t in before.tasks}
after_tasks = {t.id: t for t in after.tasks}
for tid, ta in after_tasks.items():
tb = before_tasks.get(tid)
if not tb:
continue
        if tb.status not in ("overdue", "done") and ta.status == "overdue":
s -= 2.0
if tb.status != "done" and ta.status == "done":
dl = _parse_dt(tb.deadline)
if dl >= now:
s += 2.0
else:
s += 0.5
if (not tb.delegated_to) and ta.delegated_to:
de = next((c for c in after.contacts if c.name == ta.delegated_to), None)
if de and de.importance <= 3:
s += 1.0
if action_ok and action.action_type == "complete_task":
s += _COMPLETE_TASK_VALID_MICRO_BONUS
if action_ok and action.action_type == "delegate_task":
s += _DELEGATE_TASK_VALID_MICRO_BONUS
return s
def catastrophic(world: WorldState) -> bool:
    vip_furious = any(c.importance >= 4 and c.mood == "furious" for c in world.contacts)
    return vip_furious or critical_unreplied_count(world) > 3
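# Example: a single furious contact with importance >= 4, or a queue of four or
# more unreplied critical emails, makes the state catastrophic; if the episode
# ends in that state, aggregate_scores() applies the -15.0 penalty.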
def _scaffold_learning_signal(
before: WorldState,
after: WorldState,
action: GhostexecAction,
*,
action_ok: bool,
step_index: int | None,
max_steps: int | None,
) -> float:
if not action_ok:
return 0.0
if action.action_type == "do_nothing":
return 0.0
s = 0.0
critical_before = critical_unreplied_count(before)
critical_after = critical_unreplied_count(after)
conflict_before = len(meeting_conflicts(before))
conflict_after = len(meeting_conflicts(after))
overdue_before = len(_overdue_tasks(before))
overdue_after = len(_overdue_tasks(after))
if action.action_type == "reply_email":
if critical_after < critical_before:
s += 0.16
elif critical_before > 0:
s += 0.05
if action.action_type in ("reschedule_meeting", "cancel_meeting"):
if conflict_after < conflict_before:
s += 0.15
elif conflict_before > 0:
s += 0.04
if action.action_type in ("complete_task", "delegate_task"):
if overdue_after < overdue_before:
s += 0.12
elif overdue_before > 0:
s += 0.03
    # Early-episode shaping is slightly amplified to guide exploration; late-episode shaping is damped.
if step_index is not None and max_steps and max_steps > 0:
frac = max(0.0, min(1.0, step_index / max_steps))
if frac <= 0.33:
s *= 1.20
elif frac >= 0.85:
s *= 0.90
return max(-_SCAFFOLD_CAP, min(_SCAFFOLD_CAP, s))
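# Example: a reply_email that clears a critical email earns a 0.16 scaffold
# signal; at step 3 of 20 (frac 0.15 <= 0.33) it is amplified to 0.192, while at
# step 18 of 20 (frac 0.90 >= 0.85) it is damped to 0.144.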
def _state_potential(world: WorldState) -> float:
conflicts = len(meeting_conflicts(world))
critical_open = critical_unreplied_count(world)
overdue = len(_overdue_tasks(world))
stress = float(world.stress)
# Lower operational pressure => higher potential.
return -(
1.15 * critical_open
+ 0.90 * conflicts
+ 0.55 * overdue
+ 0.02 * stress
)
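# Example: holding everything else fixed, replying to one critical email raises
# the potential by 1.15, so the raw potential delta for that step is +1.15;
# compute_step_reward() then clamps it to _POTENTIAL_CAP = 0.25 before it joins
# the other shaping terms.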
def _budgeted_shaping_total(base_weighted_inner: float, shaping_total_inner: float) -> float:
# Keep shaping informative but bounded against the base objective to avoid exploit loops.
budget = _SHAPING_TO_BASE_BUDGET * (abs(base_weighted_inner) + 0.05)
return max(-budget, min(budget, shaping_total_inner))
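# Example: with a base weighted term of 0.50 the shaping budget is
# 1.25 * (0.50 + 0.05) = 0.6875, so a raw shaping total of +1.0 is clipped to
# +0.6875; near-zero base steps get almost no shaping (budget ~0.0625).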
def _quality_separation_signal(
*,
c: float,
r: float,
t: float,
action: GhostexecAction,
action_ok: bool,
) -> float:
# Amplify distance between clearly good vs clearly bad valid actions.
if not action_ok or action.action_type == "do_nothing":
return 0.0
base = W_CONFLICT * c + W_REL * r + W_TASK * t
if base >= 0.90:
return _QUALITY_CAP
if base >= 0.35:
return 0.12
if base <= -0.90:
return -_QUALITY_CAP
if base <= -0.35:
return -0.12
return 0.0
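# Example: a valid action whose base 0.35*c + 0.35*r + 0.30*t reaches 1.0 earns
# the full +0.28; a milder base of 0.5 earns +0.12; anything strictly inside
# (-0.35, 0.35) is left untouched.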
def aggregate_scores(
conflict: float,
relationship: float,
task: float,
*,
conflict_raw: float,
critical_queue_bonus: float,
weighted_inner: float,
weighted_base_only: float,
shaping_synergy: float,
shaping_tradeoff: float,
shaping_potential: float,
shaping_scaffold: float,
shaping_quality: float,
action_ok: bool,
episode_done: bool,
world_after: WorldState,
) -> RewardBreakdown:
weighted = WEIGHTED_OUTPUT_SCALE * weighted_inner
weighted_base_only_scaled = WEIGHTED_OUTPUT_SCALE * weighted_base_only
shaping_total = WEIGHTED_OUTPUT_SCALE * (
shaping_synergy + shaping_tradeoff + shaping_potential + shaping_scaffold + shaping_quality
)
denom = abs(weighted_base_only_scaled) + 1e-6
shaping_ratio = min(10.0, abs(shaping_total) / denom)
inv = 0.0
if not action_ok:
inv = -0.25
bonus = 0.0
cata = 0.0
if episode_done:
if world_after.stress < 40:
bonus = 10.0
if catastrophic(world_after):
cata = -15.0
final = weighted + inv + bonus + cata
return RewardBreakdown(
conflict_raw=conflict_raw,
critical_queue_bonus=critical_queue_bonus,
conflict=conflict,
relationship=relationship,
task=task,
shaping_synergy=WEIGHTED_OUTPUT_SCALE * shaping_synergy,
shaping_tradeoff=WEIGHTED_OUTPUT_SCALE * shaping_tradeoff,
shaping_potential=WEIGHTED_OUTPUT_SCALE * shaping_potential,
shaping_scaffold=WEIGHTED_OUTPUT_SCALE * shaping_scaffold,
shaping_quality=WEIGHTED_OUTPUT_SCALE * shaping_quality,
shaping_total=shaping_total,
shaping_to_base_ratio=shaping_ratio,
weighted_base=weighted,
output_scale=WEIGHTED_OUTPUT_SCALE,
invalid_step_adjustment=inv,
episode_completion_bonus=bonus,
catastrophic_penalty=cata,
do_nothing_floor=0.0,
final=final,
)
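# Example: with weighted_inner = 0.90 on a valid, non-terminal step the final
# reward is 0.48 * 0.90 = 0.432; the same weighted_inner with action_ok=False
# still keeps its full sub-scores but lands at 0.432 - 0.25 = 0.182.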
def apply_do_nothing_penalty_floor(
action: GhostexecAction,
breakdown: RewardBreakdown,
) -> RewardBreakdown:
if action.action_type != "do_nothing":
return breakdown
floor_delta = _DO_NOTHING_STRICT_PENALTY
new_final = breakdown.final + floor_delta
return breakdown.model_copy(
update={"do_nothing_floor": floor_delta, "final": new_final},
)
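# Example: a do_nothing step whose breakdown.final was +0.05 ends at
# 0.05 - 0.15 = -0.10, so idling is never reward-neutral.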
def compute_step_reward(
before: WorldState,
after: WorldState,
action: GhostexecAction,
*,
action_ok: bool,
episode_done: bool,
relationship_suppressed_for_email_to: frozenset[str] | None = None,
reward_mode: str = "full",
step_index: int | None = None,
max_steps: int | None = None,
) -> RewardBreakdown:
c_core = score_conflict_resolution(before, after, action, action_ok=action_ok)
crit_b = score_critical_queue_bonus(before, after)
c_raw = c_core + crit_b
c = max(-CONFLICT_RAW_CAP, min(CONFLICT_RAW_CAP, c_raw))
r = score_relationship(
before,
after,
action,
action_ok=action_ok,
relationship_suppressed_for_email_to=relationship_suppressed_for_email_to,
)
t = score_task_completion(before, after, action, action_ok=action_ok)
weighted_base_only = W_CONFLICT * c + W_REL * r + W_TASK * t
weighted_inner = weighted_base_only
synergy = 0.0
tradeoff_penalty = 0.0
potential_progress = 0.0
scaffold_signal = 0.0
quality_signal = 0.0
if reward_mode in ("full", "shaping"):
# Bounded nonlinear shaping to speed learning without overpowering base channels.
if c > 0.0 and r > 0.0:
synergy += min(_SYNERGY_CAP, 0.18 * math.tanh(0.35 * c * r))
if t > 0.0 and (c > 0.0 or r > 0.0):
bridge = max(c, 0.0) + max(r, 0.0)
synergy += min(_SYNERGY_CAP, 0.10 * math.tanh(0.25 * t * bridge))
if c < -0.5 and r < -0.5:
tradeoff_penalty -= min(_TRADEOFF_CAP, 0.12 * math.tanh(0.25 * abs(c * r)))
if t < -0.5 and (c < 0.0 or r < 0.0):
debt = abs(t) * (abs(min(c, 0.0)) + abs(min(r, 0.0)))
tradeoff_penalty -= min(_TRADEOFF_CAP, 0.08 * math.tanh(0.18 * debt))
potential_progress = max(
-_POTENTIAL_CAP,
min(_POTENTIAL_CAP, _state_potential(after) - _state_potential(before)),
)
scaffold_signal = _scaffold_learning_signal(
before,
after,
action,
action_ok=action_ok,
step_index=step_index,
max_steps=max_steps,
)
quality_signal = _quality_separation_signal(
c=c,
r=r,
t=t,
action=action,
action_ok=action_ok,
)
shaping_total_inner = (
synergy + tradeoff_penalty + potential_progress + scaffold_signal + quality_signal
)
weighted_inner += _budgeted_shaping_total(weighted_base_only, shaping_total_inner)
bd = aggregate_scores(
c,
r,
t,
conflict_raw=c_raw,
critical_queue_bonus=crit_b,
weighted_inner=weighted_inner,
weighted_base_only=weighted_base_only,
shaping_synergy=synergy,
shaping_tradeoff=tradeoff_penalty,
shaping_potential=potential_progress,
shaping_scaffold=scaffold_signal,
shaping_quality=quality_signal,
action_ok=action_ok,
episode_done=episode_done,
world_after=after,
)
return apply_do_nothing_penalty_floor(action, bd)
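# Typical call site (hypothetical driver code; WorldState / GhostexecAction come
# from models.py and their construction is environment-specific):
#
#     bd = compute_step_reward(
#         before, after, action,
#         action_ok=True, episode_done=False,
#         step_index=3, max_steps=20,
#     )
#     step_reward = bd.final  # scalar handed to the learner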