vaibhav12332112312 committed on
Commit
9ee7a09
·
1 Parent(s): 56f70b1

train: per-step credit + drop replies + larger batches

Browse files

- run_llm_episode now computes Monte-Carlo return-to-go per day
(G_t = r_t + 0.95*G_{t+1}, terminal = grader_score*5); each pair
gets its own return for top-K filtering, removing the
same-reward-per-pair caveat.
- SFTConfig: per_device_train_batch_size 16->32, bf16, fused AdamW,
warmup_ratio=0.1, no grad checkpointing for ~90% VRAM use on 48GB.
- SYSTEM_PROMPT rewritten as full tool/action schema; removed
subjective rules (optimal posts, diversity bonus); reach-bonus
clarified.
- format_obs no longer truncates tool results; history window 4->14
messages (7 days); removed hard-coded auto-rest heuristic.
- Drop replies feature across env, models, client, inference, and
training scripts.
- NUM_ROUNDS/EPISODES_PER_ROUND set to 1 for smoke run.

Made-with: Cursor

__init__.py CHANGED
@@ -10,7 +10,6 @@ from .client import ViraltestEnv
10
  from .models import (
11
  CollabProposal,
12
  EngagementSignals,
13
- ReplyAction,
14
  ScheduledAction,
15
  ToolCall,
16
  ToolResult,
@@ -21,7 +20,6 @@ from .models import (
21
  __all__ = [
22
  "CollabProposal",
23
  "EngagementSignals",
24
- "ReplyAction",
25
  "ScheduledAction",
26
  "ToolCall",
27
  "ToolResult",
 
10
  from .models import (
11
  CollabProposal,
12
  EngagementSignals,
 
13
  ScheduledAction,
14
  ToolCall,
15
  ToolResult,
 
20
  __all__ = [
21
  "CollabProposal",
22
  "EngagementSignals",
 
23
  "ScheduledAction",
24
  "ToolCall",
25
  "ToolResult",
client.py CHANGED
@@ -43,12 +43,6 @@ class ViraltestEnv(EnvClient[ViraltestAction, ViraltestObservation, State]):
43
  actions_list.append(item)
44
  payload["scheduled_actions"] = actions_list
45
 
46
- if action.replies:
47
- payload["replies"] = [
48
- {"post_hour": r.post_hour, "reply_hour": r.reply_hour}
49
- for r in action.replies
50
- ]
51
-
52
  if action.collab:
53
  payload["collab"] = {
54
  "partner_id": action.collab.partner_id,
 
43
  actions_list.append(item)
44
  payload["scheduled_actions"] = actions_list
45
 
 
 
 
 
 
 
46
  if action.collab:
47
  payload["collab"] = {
48
  "partner_id": action.collab.partner_id,
inference.py CHANGED
@@ -74,7 +74,6 @@ RESPONSE FORMAT (JSON only, no markdown, no prose):
74
  {"hour": 12, "action_type": "post", "content_type": "reel", "topic": "AI tools", "tags": ["ai", "coding"], "intent": "watch_bait"},
75
  {"hour": 18, "action_type": "post", "content_type": "carousel", "topic": "startup life", "tags": ["startup", "growth"], "intent": "save_bait"}
76
  ],
77
- "replies": [{"post_hour": 12, "reply_hour": 13}],
78
  "notes": "Day 3: tech niche trending up. Competitor Alpha posted at 10am. Avoiding overlap."
79
  }
80
 
@@ -87,7 +86,6 @@ RULES:
87
  - Use notes to track hypotheses and observations across days
88
  - Tool calls cost API budget (starts at 100). Use wisely.
89
  - Max 2 collaborations per month
90
- - Reply within 90 minutes of a post for reach bonus
91
 
92
  Think strategically: use tools to discover what works, then exploit what you learn.""")
93
 
@@ -201,13 +199,11 @@ def parse_daily_plan(response_text: str) -> ViraltestAction:
201
  if isinstance(a, dict):
202
  scheduled.append(a)
203
 
204
- replies_raw = data.get("replies", [])
205
  notes = data.get("notes")
206
 
207
  return ViraltestAction(
208
  tool_calls=tool_calls,
209
  scheduled_actions=scheduled,
210
- replies=replies_raw if isinstance(replies_raw, list) else [],
211
  notes=notes,
212
  )
213
  except (json.JSONDecodeError, Exception):
@@ -236,7 +232,6 @@ def sanitize_predefined_topics(action: ViraltestAction, obs: Any) -> ViraltestAc
236
  return ViraltestAction(
237
  tool_calls=action.tool_calls,
238
  scheduled_actions=out,
239
- replies=action.replies,
240
  collab=action.collab,
241
  notes=action.notes,
242
  )
 
74
  {"hour": 12, "action_type": "post", "content_type": "reel", "topic": "AI tools", "tags": ["ai", "coding"], "intent": "watch_bait"},
75
  {"hour": 18, "action_type": "post", "content_type": "carousel", "topic": "startup life", "tags": ["startup", "growth"], "intent": "save_bait"}
76
  ],
 
77
  "notes": "Day 3: tech niche trending up. Competitor Alpha posted at 10am. Avoiding overlap."
78
  }
79
 
 
86
  - Use notes to track hypotheses and observations across days
87
  - Tool calls cost API budget (starts at 100). Use wisely.
88
  - Max 2 collaborations per month
 
89
 
90
  Think strategically: use tools to discover what works, then exploit what you learn.""")
91
 
 
199
  if isinstance(a, dict):
200
  scheduled.append(a)
201
 
 
202
  notes = data.get("notes")
203
 
204
  return ViraltestAction(
205
  tool_calls=tool_calls,
206
  scheduled_actions=scheduled,
 
207
  notes=notes,
208
  )
209
  except (json.JSONDecodeError, Exception):
 
232
  return ViraltestAction(
233
  tool_calls=action.tool_calls,
234
  scheduled_actions=out,
 
235
  collab=action.collab,
236
  notes=action.notes,
237
  )
models.py CHANGED
@@ -56,13 +56,6 @@ class ScheduledAction(BaseModel):
56
  return v
57
 
58
 
59
- class ReplyAction(BaseModel):
60
- """Reply to comments on a post made earlier today (within reply window)."""
61
-
62
- post_hour: int = Field(..., ge=0, le=23, description="Hour of the post to reply on")
63
- reply_hour: int = Field(..., ge=0, le=23, description="Hour to send replies")
64
-
65
-
66
  class CollabProposal(BaseModel):
67
  """Propose a collaboration with a competitor archetype."""
68
 
@@ -82,10 +75,6 @@ class ViraltestAction(Action):
82
  default_factory=list,
83
  description="Actions scheduled at specific hours; unlisted hours are rest",
84
  )
85
- replies: List[ReplyAction] = Field(
86
- default_factory=list,
87
- description="Reply actions on posts made today (within 90-min window for reach bonus)",
88
- )
89
  collab: Optional[CollabProposal] = Field(
90
  default=None,
91
  description="Optional collaboration proposal (max 2 per month)",
 
56
  return v
57
 
58
 
 
 
 
 
 
 
 
59
  class CollabProposal(BaseModel):
60
  """Propose a collaboration with a competitor archetype."""
61
 
 
75
  default_factory=list,
76
  description="Actions scheduled at specific hours; unlisted hours are rest",
77
  )
 
 
 
 
78
  collab: Optional[CollabProposal] = Field(
79
  default=None,
80
  description="Optional collaboration proposal (max 2 per month)",
server/viraltest_environment.py CHANGED
@@ -29,7 +29,6 @@ try:
29
  EngagementSignals,
30
  HeadlineMetrics,
31
  JudgeReport,
32
- ReplyAction,
33
  ScheduledAction,
34
  ToolCall,
35
  ToolResult,
@@ -42,7 +41,6 @@ except ImportError:
42
  EngagementSignals,
43
  HeadlineMetrics,
44
  JudgeReport,
45
- ReplyAction,
46
  ScheduledAction,
47
  ToolCall,
48
  ToolResult,
@@ -168,8 +166,6 @@ COLLAB_GROWTH_K = 1.50 # cross-pollination follower spillover, scales (1 - o
168
  COLLAB_PARTNER_REPEAT_PENALTY = 0.7 # discount on multipliers when partner reused this brand
169
  COLLAB_FATIGUE_K = 0.3 # per-collab diminishing-returns factor: 1/(1+K*prior_collabs_this_episode)
170
 
171
- REPLY_WINDOW_MINUTES = 90
172
- REPLY_REACH_BONUS = 1.4
173
  API_BUDGET_INITIAL = 100
174
 
175
  # Heuristic baselines for headline metric `vs_baseline_pct`.
@@ -847,19 +843,6 @@ class ViraltestEnvironment(Environment):
847
  if self._energy <= 0.0:
848
  burned_out = True
849
 
850
- # Process replies
851
- for reply in action.replies:
852
- if 0 <= reply.reply_hour < 24 and 0 <= reply.post_hour < 24:
853
- diff_minutes = abs(reply.reply_hour - reply.post_hour) * 60
854
- if diff_minutes <= REPLY_WINDOW_MINUTES:
855
- daily_engagement *= REPLY_REACH_BONUS
856
- daily_signals = EngagementSignals(
857
- watch_time=daily_signals.watch_time * REPLY_REACH_BONUS,
858
- sends_per_reach=daily_signals.sends_per_reach * REPLY_REACH_BONUS,
859
- saves=daily_signals.saves * REPLY_REACH_BONUS,
860
- likes_per_reach=daily_signals.likes_per_reach * REPLY_REACH_BONUS,
861
- )
862
-
863
  # Weekly tracking
864
  self._total_posts_this_week += daily_posts
865
  if self._day % 7 == 0 and self._day > 0:
 
29
  EngagementSignals,
30
  HeadlineMetrics,
31
  JudgeReport,
 
32
  ScheduledAction,
33
  ToolCall,
34
  ToolResult,
 
41
  EngagementSignals,
42
  HeadlineMetrics,
43
  JudgeReport,
 
44
  ScheduledAction,
45
  ToolCall,
46
  ToolResult,
 
166
  COLLAB_PARTNER_REPEAT_PENALTY = 0.7 # discount on multipliers when partner reused this brand
167
  COLLAB_FATIGUE_K = 0.3 # per-collab diminishing-returns factor: 1/(1+K*prior_collabs_this_episode)
168
 
 
 
169
  API_BUDGET_INITIAL = 100
170
 
171
  # Heuristic baselines for headline metric `vs_baseline_pct`.
 
843
  if self._energy <= 0.0:
844
  burned_out = True
845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  # Weekly tracking
847
  self._total_posts_this_week += daily_posts
848
  if self._day % 7 == 0 and self._day > 0:
training/hf_run_space_train_job.sh CHANGED
@@ -22,13 +22,12 @@ REMOTE_SCRIPT=$(cat <<'EOS'
22
  set -euo pipefail
23
  export DEBIAN_FRONTEND=noninteractive
24
  apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
25
- pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub
26
  rm -rf /work
27
  git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
28
  cd /work
29
- jupyter nbconvert --to notebook --execute training/train_grpo.ipynb \
30
- --output train_grpo.executed.ipynb \
31
- --ExecutePreprocessor.timeout="${NB_EXEC_TIMEOUT}"
32
  python -c "import os; from huggingface_hub import HfApi; HfApi().upload_folder(folder_path='.', path_in_repo='run-output', repo_id=os.environ['SPACE_REPO'], repo_type='space', allow_patterns=['training/train_grpo.executed.ipynb','plots/**','**/lora-*/**'])"
33
  EOS
34
  )
 
22
  set -euo pipefail
23
  export DEBIAN_FRONTEND=noninteractive
24
  apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
25
+ pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill
26
  rm -rf /work
27
  git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
28
  cd /work
29
+ papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
30
+ training/train_grpo.ipynb training/train_grpo.executed.ipynb
 
31
  python -c "import os; from huggingface_hub import HfApi; HfApi().upload_folder(folder_path='.', path_in_repo='run-output', repo_id=os.environ['SPACE_REPO'], repo_type='space', allow_patterns=['training/train_grpo.executed.ipynb','plots/**','**/lora-*/**'])"
32
  EOS
33
  )
training/run_llm_training.py CHANGED
@@ -106,7 +106,6 @@ def plan_smart(obs_dict, day):
106
  ScheduledAction(hour=19, action_type="post", content_type=ct2,
107
  topic=topic2, tags=tags2, intent=intent2),
108
  ],
109
- replies=[{"post_hour": 12, "reply_hour": 13}],
110
  )
111
 
112
  BASELINE_AGENTS = {
@@ -157,17 +156,13 @@ RESPONSE FORMAT — return ONLY valid JSON, no markdown, no explanation:
157
  "scheduled_actions": [
158
  {"hour": 12, "action_type": "post", "content_type": "reel", "topic": "AI tools", "tags": ["ai", "coding"], "intent": "watch_bait"}
159
  ],
160
- "replies": [{"post_hour": 12, "reply_hour": 13}],
161
  "notes": "strategy notes"
162
  }
163
 
164
  RULES:
165
  - hour: 0-23. content_type: reel|story|carousel|text_post
166
  - intent: send_bait|save_bait|watch_bait|like_bait
167
- - 1-2 posts per day is optimal. More = audience fatigue + energy drain.
168
- - Empty scheduled_actions = rest (recovers energy).
169
- - Vary content types and topics across days for diversity bonus.
170
- - Reply within 90 min of a post for reach bonus.""")
171
 
172
  LEARNED_ADDENDUM = """
173
 
@@ -253,7 +248,7 @@ def parse_model_output(text):
253
  pass
254
  return ViraltestAction(
255
  tool_calls=tool_calls, scheduled_actions=scheduled,
256
- replies=data.get("replies", []), notes=data.get("notes"),
257
  )
258
  except (json.JSONDecodeError, Exception):
259
  return ViraltestAction(scheduled_actions=[])
 
106
  ScheduledAction(hour=19, action_type="post", content_type=ct2,
107
  topic=topic2, tags=tags2, intent=intent2),
108
  ],
 
109
  )
110
 
111
  BASELINE_AGENTS = {
 
156
  "scheduled_actions": [
157
  {"hour": 12, "action_type": "post", "content_type": "reel", "topic": "AI tools", "tags": ["ai", "coding"], "intent": "watch_bait"}
158
  ],
 
159
  "notes": "strategy notes"
160
  }
161
 
162
  RULES:
163
  - hour: 0-23. content_type: reel|story|carousel|text_post
164
  - intent: send_bait|save_bait|watch_bait|like_bait
165
+ - Empty scheduled_actions = rest (recovers energy).""")
 
 
 
166
 
167
  LEARNED_ADDENDUM = """
168
 
 
248
  pass
249
  return ViraltestAction(
250
  tool_calls=tool_calls, scheduled_actions=scheduled,
251
+ notes=data.get("notes"),
252
  )
253
  except (json.JSONDecodeError, Exception):
254
  return ViraltestAction(scheduled_actions=[])
training/run_training_evidence.py CHANGED
@@ -100,7 +100,6 @@ def plan_smart(obs_dict: dict, day: int) -> ViraltestAction:
100
  ScheduledAction(hour=19, action_type="post", content_type=ct2,
101
  topic=topic2, tags=tags2, intent=intent2),
102
  ],
103
- replies=[{"post_hour": 12, "reply_hour": 13}],
104
  notes=f"Day {day}: varied content at peak hours.",
105
  )
106
 
@@ -156,7 +155,6 @@ class PostingPolicy:
156
  tag_offset: int = 0
157
  topic_offset: int = 0
158
  create_hour: Optional[int] = None
159
- use_reply: bool = False
160
  use_tools_early: bool = False
161
  rest_if_low_energy: float = 0.3
162
 
@@ -186,16 +184,9 @@ class PostingPolicy:
186
  tool_calls.append(ToolCall(name="query_trends",
187
  arguments={"niche": NICHES[day % len(NICHES)]}))
188
 
189
- replies = []
190
- if policy.use_reply and policy.post_hours:
191
- first_post = policy.post_hours[0]
192
- if first_post < 23:
193
- replies = [{"post_hour": first_post, "reply_hour": first_post + 1}]
194
-
195
  return ViraltestAction(
196
  tool_calls=tool_calls,
197
  scheduled_actions=actions,
198
- replies=replies,
199
  notes=f"Day {day}: policy-driven plan.",
200
  )
201
  return plan_fn
@@ -208,13 +199,12 @@ class PostingPolicy:
208
  tag_offset=self.tag_offset,
209
  topic_offset=self.topic_offset,
210
  create_hour=self.create_hour,
211
- use_reply=self.use_reply,
212
  use_tools_early=self.use_tools_early,
213
  rest_if_low_energy=self.rest_if_low_energy,
214
  )
215
 
216
  mutation = rng.choice(["hours", "types", "intents", "tags", "topics",
217
- "create", "reply", "tools", "energy", "n_posts"])
218
 
219
  if mutation == "hours":
220
  child.post_hours = sorted(rng.sample(range(6, 23), min(rng.randint(1, 3), 3)))
@@ -230,8 +220,6 @@ class PostingPolicy:
230
  child.topic_offset = rng.randint(0, len(ALL_TOPICS) - 1)
231
  elif mutation == "create":
232
  child.create_hour = rng.choice([None, 7, 8, 9, 10])
233
- elif mutation == "reply":
234
- child.use_reply = not child.use_reply
235
  elif mutation == "tools":
236
  child.use_tools_early = not child.use_tools_early
237
  elif mutation == "energy":
@@ -262,7 +250,6 @@ def evolutionary_search(
262
  tag_offset=rng.randint(0, len(TAG_POOL) - 1),
263
  topic_offset=rng.randint(0, len(ALL_TOPICS) - 1),
264
  create_hour=rng.choice([None, 7, 8, 9]),
265
- use_reply=rng.random() > 0.5,
266
  use_tools_early=rng.random() > 0.5,
267
  rest_if_low_energy=rng.choice([0.2, 0.25, 0.3, 0.35]),
268
  ) for _ in range(population_size)]
 
100
  ScheduledAction(hour=19, action_type="post", content_type=ct2,
101
  topic=topic2, tags=tags2, intent=intent2),
102
  ],
 
103
  notes=f"Day {day}: varied content at peak hours.",
104
  )
105
 
 
155
  tag_offset: int = 0
156
  topic_offset: int = 0
157
  create_hour: Optional[int] = None
 
158
  use_tools_early: bool = False
159
  rest_if_low_energy: float = 0.3
160
 
 
184
  tool_calls.append(ToolCall(name="query_trends",
185
  arguments={"niche": NICHES[day % len(NICHES)]}))
186
 
 
 
 
 
 
 
187
  return ViraltestAction(
188
  tool_calls=tool_calls,
189
  scheduled_actions=actions,
 
190
  notes=f"Day {day}: policy-driven plan.",
191
  )
192
  return plan_fn
 
199
  tag_offset=self.tag_offset,
200
  topic_offset=self.topic_offset,
201
  create_hour=self.create_hour,
 
202
  use_tools_early=self.use_tools_early,
203
  rest_if_low_energy=self.rest_if_low_energy,
204
  )
205
 
206
  mutation = rng.choice(["hours", "types", "intents", "tags", "topics",
207
+ "create", "tools", "energy", "n_posts"])
208
 
209
  if mutation == "hours":
210
  child.post_hours = sorted(rng.sample(range(6, 23), min(rng.randint(1, 3), 3)))
 
220
  child.topic_offset = rng.randint(0, len(ALL_TOPICS) - 1)
221
  elif mutation == "create":
222
  child.create_hour = rng.choice([None, 7, 8, 9, 10])
 
 
223
  elif mutation == "tools":
224
  child.use_tools_early = not child.use_tools_early
225
  elif mutation == "energy":
 
250
  tag_offset=rng.randint(0, len(TAG_POOL) - 1),
251
  topic_offset=rng.randint(0, len(ALL_TOPICS) - 1),
252
  create_hour=rng.choice([None, 7, 8, 9]),
 
253
  use_tools_early=rng.random() > 0.5,
254
  rest_if_low_energy=rng.choice([0.2, 0.25, 0.3, 0.35]),
255
  ) for _ in range(population_size)]
training/train_grpo.ipynb CHANGED
@@ -301,8 +301,7 @@
301
  " topic=ALL_TOPICS[(day*2+1)%len(ALL_TOPICS)],\n",
302
  " tags=[TAG_POOL[(day*6+3+i)%len(TAG_POOL)] for i in range(3)],\n",
303
  " intent=INTENTS[(day*2+1)%4]),\n",
304
- " ],\n",
305
- " replies=[{\"post_hour\": 12, \"reply_hour\": 13}])\n",
306
  "\n",
307
  "BASELINE_AGENTS = {\n",
308
  " \"always_rest\": plan_always_rest, \"spam\": plan_spam,\n",
@@ -570,22 +569,38 @@
570
  "\n",
571
  "RESPONSE FORMAT — return ONLY valid JSON, no markdown:\n",
572
  "{\n",
573
- " \"tool_calls\": [{\"name\": \"query_trends\", \"arguments\": {\"niche\": \"tech\"}}],\n",
574
  " \"scheduled_actions\": [\n",
575
- " {\"hour\": 12, \"action_type\": \"post\", \"content_type\": \"reel\",\n",
576
- " \"topic\": \"AI tools\", \"tags\": [\"ai\", \"coding\"], \"intent\": \"watch_bait\"}\n",
 
 
577
  " ],\n",
578
- " \"replies\": [{\"post_hour\": 12, \"reply_hour\": 13}],\n",
579
  " \"notes\": \"strategy notes\"\n",
580
  "}\n",
581
  "\n",
582
- "RULES:\n",
583
- "- content_type: reel|story|carousel|text_post\n",
584
- "- intent: send_bait|save_bait|watch_bait|like_bait\n",
585
- "- 1-2 posts/day optimal. More = fatigue.\n",
586
- "- Empty scheduled_actions = rest (recovers energy).\n",
587
- "- Vary content types and topics for diversity bonus.\n",
588
- "- Reply within 90 min of post for reach bonus.\"\"\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
  "\n",
590
  "\n",
591
  "def format_obs(obs):\n",
@@ -600,7 +615,7 @@
600
  " tool_str = \"\"\n",
601
  " for tr in getattr(obs, \"tool_results\", []):\n",
602
  " if tr.success:\n",
603
- " tool_str += f\" {tr.name}: {json.dumps(tr.data)[:200]}\\n\"\n",
604
  " if not tool_str:\n",
605
  " tool_str = \" (none)\\n\"\n",
606
  " return (f\"Day: {day_name} | days_elapsed={obs.days_elapsed}\\n\"\n",
@@ -633,7 +648,6 @@
633
  " return ViraltestAction(\n",
634
  " tool_calls=tool_calls,\n",
635
  " scheduled_actions=scheduled,\n",
636
- " replies=data.get(\"replies\", []),\n",
637
  " notes=data.get(\"notes\"),\n",
638
  " )\n",
639
  " except Exception:\n",
@@ -652,10 +666,10 @@
652
  " return torch.device(\"cpu\")\n",
653
  "\n",
654
  "\n",
655
- "def generate_action(mdl, tok, obs, history, temperature=0.7):\n",
656
  " prompt = format_obs(obs)\n",
657
  " messages = [{\"role\": \"system\", \"content\": SYSTEM_PROMPT}]\n",
658
- " messages.extend(history[-4:])\n",
659
  " messages.append({\"role\": \"user\", \"content\": prompt})\n",
660
  " text_input = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
661
  " inputs = tok(text_input, return_tensors=\"pt\").to(_infer_model_device(mdl))\n",
@@ -663,21 +677,27 @@
663
  " out = mdl.generate(**inputs, max_new_tokens=512, temperature=temperature,\n",
664
  " do_sample=True, top_p=0.9, pad_token_id=tok.eos_token_id)\n",
665
  " resp = tok.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
 
 
 
 
 
 
 
 
666
  " return resp, parse_model_output(resp)\n",
667
  "\n",
668
  "\n",
669
- "def run_llm_episode(mdl, tok, task, seed=42, verbose=False):\n",
670
  " env = ViraltestEnvironment()\n",
671
  " obs = env.reset(task=task, seed=seed)\n",
672
  " rewards, energies = [], [obs.creator_energy]\n",
673
  " history, pairs = [], []\n",
674
  " for day in range(1, TASK_HORIZON + 1):\n",
675
  " if obs.done: break\n",
676
- " if obs.creator_energy <= 0.25:\n",
677
- " action = ViraltestAction(scheduled_actions=[])\n",
678
- " resp = '{\"scheduled_actions\": []}'\n",
679
- " else:\n",
680
- " resp, action = generate_action(mdl, tok, obs, history)\n",
681
  " prompt = format_obs(obs)\n",
682
  " pairs.append({\"prompt\": prompt, \"response\": resp})\n",
683
  " obs = env.step(action)\n",
@@ -691,9 +711,17 @@
691
  " print(f\" Day {day:2d}: r={r:.4f} e={obs.creator_energy:.2f} posts={n_p} tools={len(action.tool_calls)}\")\n",
692
  " if obs.done: break\n",
693
  " gs = (obs.metadata or {}).get(\"grader_score\", 0.0)\n",
 
 
 
 
 
 
 
 
694
  " return {\"task\": task, \"grader_score\": gs, \"total_reward\": sum(rewards),\n",
695
  " \"final_energy\": obs.creator_energy, \"rewards\": rewards,\n",
696
- " \"energies\": energies, \"pairs\": pairs,\n",
697
  " \"follower_delta\": obs.follower_count - 10000,\n",
698
  " \"burned_out\": obs.creator_energy <= 0}\n",
699
  "\n",
@@ -778,8 +806,8 @@
778
  "from trl import SFTTrainer, SFTConfig\n",
779
  "from datasets import Dataset\n",
780
  "\n",
781
- "NUM_ROUNDS = 4\n",
782
- "EPISODES_PER_ROUND = 6\n",
783
  "TOP_K_FRACTION = 0.5\n",
784
  "\n",
785
  "training_log = {\n",
@@ -811,19 +839,21 @@
811
  " text = (f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
812
  " f\"<|im_start|>user\\n{pr['prompt']}<|im_end|>\\n\"\n",
813
  " f\"<|im_start|>assistant\\n{pr['response']}<|im_end|>\")\n",
814
- " all_pairs.append({\"text\": text, \"reward\": ep_reward})\n",
815
  "\n",
 
816
  " print(f\" ep {ep+1}/{EPISODES_PER_ROUND}: {task.split('_')[-1]:>11s} \"\n",
817
- " f\"grader={result['grader_score']:.4f} reward={ep_reward:.3f}\")\n",
 
818
  "\n",
819
  " avg_r = np.mean(episode_rewards)\n",
820
  " avg_g = np.mean(episode_graders)\n",
821
  " print(f\" Avg reward={avg_r:.3f} Avg grader={avg_g:.4f}\")\n",
822
  "\n",
823
- " # Filter to top-K\n",
824
  " threshold = np.percentile([p[\"reward\"] for p in all_pairs], (1 - TOP_K_FRACTION) * 100)\n",
825
  " filtered = [p for p in all_pairs if p[\"reward\"] >= threshold] or all_pairs\n",
826
- " print(f\" Filtered to {len(filtered)}/{len(all_pairs)} samples\")\n",
827
  "\n",
828
  " dataset = Dataset.from_list([{\"text\": p[\"text\"]} for p in filtered])\n",
829
  "\n",
@@ -831,14 +861,18 @@
831
  " sft_config = SFTConfig(\n",
832
  " output_dir=f\"./checkpoints/round_{round_idx}\",\n",
833
  " num_train_epochs=2,\n",
834
- " per_device_train_batch_size=1,\n",
835
- " gradient_accumulation_steps=4,\n",
836
  " learning_rate=2e-5,\n",
837
- " warmup_steps=5,\n",
838
- " logging_steps=5,\n",
839
  " save_strategy=\"no\",\n",
840
  " max_length=1024,\n",
841
- " fp16=True,\n",
 
 
 
 
842
  " report_to=\"none\",\n",
843
  " )\n",
844
  "\n",
@@ -1082,7 +1116,7 @@
1082
  "name": "python",
1083
  "nbconvert_exporter": "python",
1084
  "pygments_lexer": "ipython3",
1085
- "version": "3.14.2"
1086
  }
1087
  },
1088
  "nbformat": 4,
 
301
  " topic=ALL_TOPICS[(day*2+1)%len(ALL_TOPICS)],\n",
302
  " tags=[TAG_POOL[(day*6+3+i)%len(TAG_POOL)] for i in range(3)],\n",
303
  " intent=INTENTS[(day*2+1)%4]),\n",
304
+ " ])\n",
 
305
  "\n",
306
  "BASELINE_AGENTS = {\n",
307
  " \"always_rest\": plan_always_rest, \"spam\": plan_spam,\n",
 
569
  "\n",
570
  "RESPONSE FORMAT — return ONLY valid JSON, no markdown:\n",
571
  "{\n",
572
+ " \"tool_calls\": [{\"name\": \"<tool>\", \"arguments\": {...}}],\n",
573
  " \"scheduled_actions\": [\n",
574
+ " {\"hour\": 0-23, \"action_type\": \"post|create_content\",\n",
575
+ " \"content_type\": \"reel|story|carousel|text_post\",\n",
576
+ " \"topic\": \"<string>\", \"tags\": [\"...\"],\n",
577
+ " \"intent\": \"send_bait|save_bait|watch_bait|like_bait\"}\n",
578
  " ],\n",
 
579
  " \"notes\": \"strategy notes\"\n",
580
  "}\n",
581
  "\n",
582
+ "TOOLS (cost in API budget, total=100):\n",
583
+ "- query_trends(niche) cost=1 trending topics+tags for niche\n",
584
+ "- query_audience(segment_id) cost=2 segment topic affinities + active hours\n",
585
+ "- query_competitor(competitor_id, window_days) cost=2 competitor recent posts\n",
586
+ "- query_tag_history(tag) cost=1 your past signals (watch/sends/saves/likes) for a tag\n",
587
+ "- predict_engagement(scheduled_actions) cost=3 simulate a plan WITHOUT committing\n",
588
+ "- draft_review(scheduled_actions) cost=3 AI review of a draft plan\n",
589
+ "- query_creator_pool() cost=1 list collab partners with audience overlap\n",
590
+ "- propose_collab(partner_id, content_type, hour) cost=5 co-author the post at that hour (max 2/month)\n",
591
+ "\n",
592
+ "ACTION SCHEMA:\n",
593
+ "- hour: 0..23 (unlisted hours = rest)\n",
594
+ "- action_type: post (publish) | create_content (build queue, no publish)\n",
595
+ "- content_type: reel | story | carousel | text_post\n",
596
+ "- intent: which Mosseri signal the post optimises for\n",
597
+ " send_bait -> DM shares (strongest discovery signal)\n",
598
+ " save_bait -> bookmarks (content quality)\n",
599
+ " watch_bait -> reels watch time\n",
600
+ " like_bait -> likes from existing followers\n",
601
+ "- tags: up to 5 hashtags\n",
602
+ "- topic: free-form string\n",
603
+ "- empty scheduled_actions = full day rest\"\"\")\n",
604
  "\n",
605
  "\n",
606
  "def format_obs(obs):\n",
 
615
  " tool_str = \"\"\n",
616
  " for tr in getattr(obs, \"tool_results\", []):\n",
617
  " if tr.success:\n",
618
+ " tool_str += f\" {tr.name}: {json.dumps(tr.data)}\\n\"\n",
619
  " if not tool_str:\n",
620
  " tool_str = \" (none)\\n\"\n",
621
  " return (f\"Day: {day_name} | days_elapsed={obs.days_elapsed}\\n\"\n",
 
648
  " return ViraltestAction(\n",
649
  " tool_calls=tool_calls,\n",
650
  " scheduled_actions=scheduled,\n",
 
651
  " notes=data.get(\"notes\"),\n",
652
  " )\n",
653
  " except Exception:\n",
 
666
  " return torch.device(\"cpu\")\n",
667
  "\n",
668
  "\n",
669
+ "def generate_action(mdl, tok, obs, history, temperature=0.7, debug=True):\n",
670
  " prompt = format_obs(obs)\n",
671
  " messages = [{\"role\": \"system\", \"content\": SYSTEM_PROMPT}]\n",
672
+ " messages.extend(history[-14:])\n",
673
  " messages.append({\"role\": \"user\", \"content\": prompt})\n",
674
  " text_input = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
675
  " inputs = tok(text_input, return_tensors=\"pt\").to(_infer_model_device(mdl))\n",
 
677
  " out = mdl.generate(**inputs, max_new_tokens=512, temperature=temperature,\n",
678
  " do_sample=True, top_p=0.9, pad_token_id=tok.eos_token_id)\n",
679
  " resp = tok.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
680
+ " if debug:\n",
681
+ " print(\"=\" * 60)\n",
682
+ " print(f\"[LLM PROMPT] tokens={inputs['input_ids'].shape[1]}\")\n",
683
+ " print(prompt)\n",
684
+ " print(\"-\" * 60)\n",
685
+ " print(f\"[LLM RESPONSE] tokens={out.shape[1] - inputs['input_ids'].shape[1]}\")\n",
686
+ " print(resp)\n",
687
+ " print(\"=\" * 60)\n",
688
  " return resp, parse_model_output(resp)\n",
689
  "\n",
690
  "\n",
691
+ "def run_llm_episode(mdl, tok, task, seed=42, verbose=False, debug_llm=True):\n",
692
  " env = ViraltestEnvironment()\n",
693
  " obs = env.reset(task=task, seed=seed)\n",
694
  " rewards, energies = [], [obs.creator_energy]\n",
695
  " history, pairs = [], []\n",
696
  " for day in range(1, TASK_HORIZON + 1):\n",
697
  " if obs.done: break\n",
698
+ " if debug_llm:\n",
699
+ " print(f\"\\n>>> Day {day} | task={task} | energy={obs.creator_energy:.2f}\")\n",
700
+ " resp, action = generate_action(mdl, tok, obs, history, debug=debug_llm)\n",
 
 
701
  " prompt = format_obs(obs)\n",
702
  " pairs.append({\"prompt\": prompt, \"response\": resp})\n",
703
  " obs = env.step(action)\n",
 
711
  " print(f\" Day {day:2d}: r={r:.4f} e={obs.creator_energy:.2f} posts={n_p} tools={len(action.tool_calls)}\")\n",
712
  " if obs.done: break\n",
713
  " gs = (obs.metadata or {}).get(\"grader_score\", 0.0)\n",
714
+ " # Per-step credit assignment: G_t = r_t + gamma * G_{t+1}, terminal = grader_score * w\n",
715
+ " GAMMA, TERMINAL_W = 0.95, 5.0\n",
716
+ " G, returns = gs * TERMINAL_W, [0.0] * len(rewards)\n",
717
+ " for t in reversed(range(len(rewards))):\n",
718
+ " G = rewards[t] + GAMMA * G\n",
719
+ " returns[t] = G\n",
720
+ " for i, pr in enumerate(pairs):\n",
721
+ " pr[\"return\"] = returns[i] if i < len(returns) else 0.0\n",
722
  " return {\"task\": task, \"grader_score\": gs, \"total_reward\": sum(rewards),\n",
723
  " \"final_energy\": obs.creator_energy, \"rewards\": rewards,\n",
724
+ " \"returns\": returns, \"energies\": energies, \"pairs\": pairs,\n",
725
  " \"follower_delta\": obs.follower_count - 10000,\n",
726
  " \"burned_out\": obs.creator_energy <= 0}\n",
727
  "\n",
 
806
  "from trl import SFTTrainer, SFTConfig\n",
807
  "from datasets import Dataset\n",
808
  "\n",
809
+ "NUM_ROUNDS = 1\n",
810
+ "EPISODES_PER_ROUND = 1\n",
811
  "TOP_K_FRACTION = 0.5\n",
812
  "\n",
813
  "training_log = {\n",
 
839
  " text = (f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
840
  " f\"<|im_start|>user\\n{pr['prompt']}<|im_end|>\\n\"\n",
841
  " f\"<|im_start|>assistant\\n{pr['response']}<|im_end|>\")\n",
842
+ " all_pairs.append({\"text\": text, \"reward\": pr[\"return\"]})\n",
843
  "\n",
844
+ " rets = result[\"returns\"]\n",
845
  " print(f\" ep {ep+1}/{EPISODES_PER_ROUND}: {task.split('_')[-1]:>11s} \"\n",
846
+ " f\"grader={result['grader_score']:.4f} reward={ep_reward:.3f} \"\n",
847
+ " f\"return[min={min(rets):.2f} max={max(rets):.2f} mean={np.mean(rets):.2f}]\")\n",
848
  "\n",
849
  " avg_r = np.mean(episode_rewards)\n",
850
  " avg_g = np.mean(episode_graders)\n",
851
  " print(f\" Avg reward={avg_r:.3f} Avg grader={avg_g:.4f}\")\n",
852
  "\n",
853
+ " # Filter to top-K by per-pair return (per-step credit assignment)\n",
854
  " threshold = np.percentile([p[\"reward\"] for p in all_pairs], (1 - TOP_K_FRACTION) * 100)\n",
855
  " filtered = [p for p in all_pairs if p[\"reward\"] >= threshold] or all_pairs\n",
856
+ " print(f\" Filtered to {len(filtered)}/{len(all_pairs)} samples (return >= {threshold:.3f})\")\n",
857
  "\n",
858
  " dataset = Dataset.from_list([{\"text\": p[\"text\"]} for p in filtered])\n",
859
  "\n",
 
861
  " sft_config = SFTConfig(\n",
862
  " output_dir=f\"./checkpoints/round_{round_idx}\",\n",
863
  " num_train_epochs=2,\n",
864
+ " per_device_train_batch_size=32,\n",
865
+ " gradient_accumulation_steps=1,\n",
866
  " learning_rate=2e-5,\n",
867
+ " warmup_ratio=0.1,\n",
868
+ " logging_steps=1,\n",
869
  " save_strategy=\"no\",\n",
870
  " max_length=1024,\n",
871
+ " bf16=True,\n",
872
+ " gradient_checkpointing=False,\n",
873
+ " dataloader_num_workers=4,\n",
874
+ " dataloader_pin_memory=True,\n",
875
+ " optim=\"adamw_torch_fused\",\n",
876
  " report_to=\"none\",\n",
877
  " )\n",
878
  "\n",
 
1116
  "name": "python",
1117
  "nbconvert_exporter": "python",
1118
  "pygments_lexer": "ipython3",
1119
+ "version": "3.13.1"
1120
  }
1121
  },
1122
  "nbformat": 4,