Spaces:

kushalExplores
/

flatmate_rl

Sleeping

App Files Files Community

kushalExplores committed 11 days ago

Commit

dbb1ce2

verified ·

1 Parent(s): 0594d27

Add step-2 GRPO notebook and hidden-flex fix

Browse files

Files changed (5) hide show

inference.py +32 -0
server/episode.py +49 -7
tests/test_flatmate_rl.py +16 -1
tests/test_reward_regression.py +1 -1
train_flatmate_rl_grpo_step2.ipynb +205 -0

inference.py CHANGED Viewed

@@ -71,6 +71,30 @@ SYSTEM_PROMPT = textwrap.dedent(
     - If a tool can perform the next required operation, call the tool immediately.
     - Do not send acknowledgement or progress messages such as "I will search now" when a tool call is needed.
     - Prefer safe, incremental progress toward storing user details, matching listings, and booking visits.
     """
 ).strip()
@@ -371,12 +395,20 @@ def build_user_prompt(step: int, observation: Any) -> str:
         Step: {step}
         Phase: {observation.phase}
         Status: {observation.status}
         Available tools: {observation.available_tools}
         Last tool result: {json.dumps(last_tool_result, ensure_ascii=False)}
         Prerequisites satisfied: {json.dumps(observation.prerequisites_satisfied, ensure_ascii=False)}
         Recent tool calls: {json.dumps(observation.recent_tool_calls, ensure_ascii=False)}
         Booked visits: {observation.booked_visits}
         Buyer/Broker transcript:
         {json.dumps(observation.buyer_conversation_history[-8:], ensure_ascii=False)}

     - If a tool can perform the next required operation, call the tool immediately.
     - Do not send acknowledgement or progress messages such as "I will search now" when a tool call is needed.
     - Prefer safe, incremental progress toward storing user details, matching listings, and booking visits.
+    - Use exact tool argument names from the prompt. Never invent aliases such as visit_time.
+    - Treat negative reward, violations, and feedback_summary as corrective feedback for the next action.
+    """
+).strip()
+TOOL_CONTRACT_PROMPT = textwrap.dedent(
+    """
+    Tool argument contract:
+    - store_user_details: tool_arguments can be {} after required buyer fields are gathered.
+    - search_posts: tool_arguments can be {}.
+    - match_location_preference: {"post_ids":["post_id", ...]}.
+    - get_commute_time: {"post_ids":["post_id", ...]}.
+    - check_calendar_slots: {"post_ids":["post_id", ...]}.
+    - shortlist: {"post_ids":["post_id", ...]}.
+    - contact_poster: {"post_id":"post_id","time_text":"exact slot from check_calendar_slots"}. This shares the buyer profile with the seller/poster and asks them to confirm profile fit plus visit time.
+    - book_viewing: {"post_id":"post_id","time_text":"same exact slot confirmed by buyer and poster"}.
+    Booking workflow:
+    1. Ask for missing buyer fields before store_user_details.
+    2. Store buyer details, search posts, match location, get commute time, then check calendar slots.
+    3. Ask the buyer to confirm one exact slot from check_calendar_slots.
+    4. Call contact_poster with post_id and time_text for that same slot.
+    5. Only after buyer_confirmed and poster_confirmed are true, call book_viewing with post_id and time_text.
     """
 ).strip()
         Step: {step}
         Phase: {observation.phase}
         Status: {observation.status}
+        Feedback summary: {observation.feedback_summary}
+        Environment message: {observation.message}
+        Step reward: {observation.step_reward}
+        Total reward: {observation.total_reward}
+        Violations: {observation.violations}
+        Remaining required fields: {observation.remaining_required_fields}
         Available tools: {observation.available_tools}
         Last tool result: {json.dumps(last_tool_result, ensure_ascii=False)}
         Prerequisites satisfied: {json.dumps(observation.prerequisites_satisfied, ensure_ascii=False)}
         Recent tool calls: {json.dumps(observation.recent_tool_calls, ensure_ascii=False)}
         Booked visits: {observation.booked_visits}
+        {TOOL_CONTRACT_PROMPT}
         Buyer/Broker transcript:
         {json.dumps(observation.buyer_conversation_history[-8:], ensure_ascii=False)}

server/episode.py CHANGED Viewed

@@ -95,6 +95,7 @@ class FlatmateEpisode:
         self._commutes_checked: dict[str, int] = {}
         self._poster_confirmations: dict[str, str] = {}
         self._client_confirmations: dict[str, str] = {}
         self._seller_confirmations: dict[str, str] = {}
         self._buyer_offer_confirmations: dict[str, str] = {}
         self._dynamic_post_id: str | None = None
@@ -136,6 +137,7 @@ class FlatmateEpisode:
         self._commutes_checked = {}
         self._poster_confirmations = {}
         self._client_confirmations = {}
         self._seller_confirmations = {}
         self._buyer_offer_confirmations = {}
         self._dynamic_post_id = None
@@ -259,6 +261,12 @@ class FlatmateEpisode:
                 slots.extend(profile["hidden_additional_availability"])
         return slots
     def _record_violation(self, text: str) -> None:
         if text not in self._violations:
             self._violations.append(text)
@@ -496,8 +504,10 @@ class FlatmateEpisode:
                 self._state.gathered_fields.append("hidden_flex_revealed")
             if alternatives_offered:
                 if "sunday 5pm" in lowered:
                     return "I can make Sunday 5pm work, so I confirm Sunday 5pm."
                 if "saturday 1pm" in lowered:
                     return "Saturday 1pm works for me too, so I confirm Saturday 1pm."
         # Scenario 2: waitlist — fire cancellation notification on first message after add_to_waitlist
@@ -861,6 +871,16 @@ class FlatmateEpisode:
         self._state.selected_posts = post_ids
         return {"tool": "shortlist", "success": True, "message": "Posts shortlisted.", "selected_posts": post_ids}
     def _tool_contact_poster(self, arguments: dict[str, Any]) -> dict[str, Any]:
         post_id = arguments.get("post_id", "")
         time_text = arguments.get("time_text", "")
@@ -872,20 +892,34 @@ class FlatmateEpisode:
             return {"tool": "contact_poster", "success": False, "message": "Time must come from check_calendar_slots."}
         self._seller_history.append(
             {
-                "role": "user",
-                "content": f"Client selected {post_id}. Can we visit at {time_text}?",
             }
         )
         self._poster_confirmations[post_id] = time_text
-        poster_message = f"Yes, confirmed. {time_text} works for the visit."
-        self._seller_history.append({"role": "assistant", "content": poster_message})
-        return {"tool": "contact_poster", "success": True, "message": f"Poster confirmed {time_text}.", "post_id": post_id, "time_text": time_text}
     def _tool_book_viewing(self, arguments: dict[str, Any]) -> dict[str, Any]:
         post_id = arguments.get("post_id", "")
         time_text = arguments.get("time_text", "")
         if post_id not in self._poster_confirmations or self._poster_confirmations[post_id] != time_text:
             return {"tool": "book_viewing", "success": False, "message": "Poster has not explicitly confirmed this time."}
         if post_id not in self._client_confirmations or self._client_confirmations[post_id] != time_text:
             return {"tool": "book_viewing", "success": False, "message": "Client has not explicitly confirmed this time."}
         if self._scenario["task_id"] == "task_visit_multi" and post_id not in self._state.selected_posts:
@@ -940,8 +974,15 @@ class FlatmateEpisode:
         config = self._scenario["scenario_creation_config"].get("negotiation_config", {})
         seller_floor = config.get("seller_floor", 0)
         self._negotiation_rounds_seller += 1
         if proposed_rent >= seller_floor:
             self._seller_price_accepted = proposed_rent
             return {
                 "tool": "propose_price_to_seller",
                 "success": True,
@@ -950,6 +991,7 @@ class FlatmateEpisode:
                 "proposed_rent": proposed_rent,
             }
         hint = " Maybe a small discount is possible." if self._negotiation_rounds_seller >= 2 else ""
         return {
             "tool": "propose_price_to_seller",
             "success": True,
@@ -1167,9 +1209,9 @@ class FlatmateEpisode:
         post = self._resolve_post(post_id)
         if not post or time_text not in post["calendar_slots"]:
             return {"tool": "confirm_seller_match", "success": False, "message": "Selected seller slot is invalid."}
-        self._seller_history.append({"role": "user", "content": f"Can we confirm {time_text} for {post_id}?"})
         self._seller_confirmations[post_id] = time_text
-        self._seller_history.append({"role": "assistant", "content": f"Confirmed, {time_text} works from the seller side."})
         return {"tool": "confirm_seller_match", "success": True, "message": f"Seller confirmed {time_text}.", "post_id": post_id, "time_text": time_text}
     def _tool_offer_matched_listing_to_buyer(self, arguments: dict[str, Any]) -> dict[str, Any]:

         self._commutes_checked: dict[str, int] = {}
         self._poster_confirmations: dict[str, str] = {}
         self._client_confirmations: dict[str, str] = {}
+        self._seller_profile_fit_confirmations: dict[str, bool] = {}
         self._seller_confirmations: dict[str, str] = {}
         self._buyer_offer_confirmations: dict[str, str] = {}
         self._dynamic_post_id: str | None = None
         self._commutes_checked = {}
         self._poster_confirmations = {}
         self._client_confirmations = {}
+        self._seller_profile_fit_confirmations = {}
         self._seller_confirmations = {}
         self._buyer_offer_confirmations = {}
         self._dynamic_post_id = None
                 slots.extend(profile["hidden_additional_availability"])
         return slots
+    def _record_client_confirmation_for_slot(self, slot: str) -> None:
+        for post_id, checked_slots in self._slots_checked.items():
+            if slot in checked_slots:
+                self._client_confirmations[post_id] = slot
+                return
     def _record_violation(self, text: str) -> None:
         if text not in self._violations:
             self._violations.append(text)
                 self._state.gathered_fields.append("hidden_flex_revealed")
             if alternatives_offered:
                 if "sunday 5pm" in lowered:
+                    self._record_client_confirmation_for_slot("Sunday 5pm")
                     return "I can make Sunday 5pm work, so I confirm Sunday 5pm."
                 if "saturday 1pm" in lowered:
+                    self._record_client_confirmation_for_slot("Saturday 1pm")
                     return "Saturday 1pm works for me too, so I confirm Saturday 1pm."
         # Scenario 2: waitlist — fire cancellation notification on first message after add_to_waitlist
         self._state.selected_posts = post_ids
         return {"tool": "shortlist", "success": True, "message": "Posts shortlisted.", "selected_posts": post_ids}
+    def _buyer_profile_summary_for_seller(self) -> str:
+        profile = self._scenario["buyer_profile"]
+        return (
+            f"buyer profile: budget up to Rs. {profile['budget_max']}; "
+            f"dietary preference {profile['dietary']}; "
+            f"preferred areas {', '.join(profile['areas'])}; "
+            f"occupation {profile['occupation']}; "
+            f"visit availability {', '.join(profile['visit_availability'])}"
+        )
     def _tool_contact_poster(self, arguments: dict[str, Any]) -> dict[str, Any]:
         post_id = arguments.get("post_id", "")
         time_text = arguments.get("time_text", "")
             return {"tool": "contact_poster", "success": False, "message": "Time must come from check_calendar_slots."}
         self._seller_history.append(
             {
+                "role": "assistant",
+                "content": (
+                    f"Client selected {post_id}. Please review this {self._buyer_profile_summary_for_seller()}. "
+                    f"Can you confirm the buyer profile is acceptable and that we can visit at {time_text}?"
+                ),
             }
         )
         self._poster_confirmations[post_id] = time_text
+        self._seller_profile_fit_confirmations[post_id] = True
+        poster_message = f"Yes, confirmed. The buyer profile is acceptable and {time_text} works for the visit."
+        self._seller_history.append({"role": "user", "content": poster_message})
+        return {
+            "tool": "contact_poster",
+            "success": True,
+            "message": f"Poster confirmed buyer profile fit and {time_text}.",
+            "post_id": post_id,
+            "time_text": time_text,
+            "buyer_profile_shared": True,
+            "seller_profile_fit_confirmed": True,
+        }
     def _tool_book_viewing(self, arguments: dict[str, Any]) -> dict[str, Any]:
         post_id = arguments.get("post_id", "")
         time_text = arguments.get("time_text", "")
         if post_id not in self._poster_confirmations or self._poster_confirmations[post_id] != time_text:
             return {"tool": "book_viewing", "success": False, "message": "Poster has not explicitly confirmed this time."}
+        if not self._seller_profile_fit_confirmations.get(post_id):
+            return {"tool": "book_viewing", "success": False, "message": "Poster has not confirmed the buyer profile fit."}
         if post_id not in self._client_confirmations or self._client_confirmations[post_id] != time_text:
             return {"tool": "book_viewing", "success": False, "message": "Client has not explicitly confirmed this time."}
         if self._scenario["task_id"] == "task_visit_multi" and post_id not in self._state.selected_posts:
         config = self._scenario["scenario_creation_config"].get("negotiation_config", {})
         seller_floor = config.get("seller_floor", 0)
         self._negotiation_rounds_seller += 1
+        self._seller_history.append(
+            {
+                "role": "assistant",
+                "content": f"The buyer is interested in {post_id}. Would you accept Rs. {proposed_rent}?",
+            }
+        )
         if proposed_rent >= seller_floor:
             self._seller_price_accepted = proposed_rent
+            self._seller_history.append({"role": "user", "content": f"Yes, I can accept Rs. {proposed_rent}."})
             return {
                 "tool": "propose_price_to_seller",
                 "success": True,
                 "proposed_rent": proposed_rent,
             }
         hint = " Maybe a small discount is possible." if self._negotiation_rounds_seller >= 2 else ""
+        self._seller_history.append({"role": "user", "content": f"I can't go as low as Rs. {proposed_rent}.{hint}"})
         return {
             "tool": "propose_price_to_seller",
             "success": True,
         post = self._resolve_post(post_id)
         if not post or time_text not in post["calendar_slots"]:
             return {"tool": "confirm_seller_match", "success": False, "message": "Selected seller slot is invalid."}
+        self._seller_history.append({"role": "assistant", "content": f"Can we confirm {time_text} for {post_id}?"})
         self._seller_confirmations[post_id] = time_text
+        self._seller_history.append({"role": "user", "content": f"Confirmed, {time_text} works from the seller side."})
         return {"tool": "confirm_seller_match", "success": True, "message": f"Seller confirmed {time_text}.", "post_id": post_id, "time_text": time_text}
     def _tool_offer_matched_listing_to_buyer(self, arguments: dict[str, Any]) -> dict[str, Any]:

tests/test_flatmate_rl.py CHANGED Viewed

@@ -200,8 +200,17 @@ def test_single_visit_scenario_books_one_visit() -> None:
     assert final_obs.done is True
     assert final_obs.booked_visits == [{"post_id": "post_023", "time": "Saturday 11am"}]
     assert len(final_obs.seller_conversation_history) >= 2
-    assert "Can we visit at Saturday 11am" in final_obs.seller_conversation_history[0]["content"]
     assert "Saturday 11am works for the visit" in final_obs.seller_conversation_history[1]["content"]
 def test_buyer_answers_diet_and_availability_when_broker_asks_for_both() -> None:
@@ -430,6 +439,10 @@ def test_hidden_flex_requires_alternative_slot_to_unlock_backup_availability() -
     obs = _msg(env, "No Tuesday slot matches. I can offer Saturday 1pm or Sunday 5pm instead.")
     assert "confirm" in obs.last_user_message.lower()
     assert "Sunday 5pm" in obs.last_user_message or "Saturday 1pm" in obs.last_user_message
 def test_multi_visit_scenario_books_two_visits() -> None:
@@ -533,3 +546,5 @@ def test_negotiation_heuristic_confirms_deal_with_agreed_rent() -> None:
     assert obs.status == "completed"
     assert obs.booked_visits == [{"post_id": "post_155", "time": "negotiated_deal", "agreed_rent": 21000}]
     assert obs.last_tool_result["tool"] == "confirm_negotiated_deal"

     assert final_obs.done is True
     assert final_obs.booked_visits == [{"post_id": "post_023", "time": "Saturday 11am"}]
     assert len(final_obs.seller_conversation_history) >= 2
+    assert final_obs.seller_conversation_history[0]["role"] == "assistant"
+    assert final_obs.seller_conversation_history[1]["role"] == "user"
+    assert "buyer profile" in final_obs.seller_conversation_history[0]["content"]
+    assert "budget up to Rs. 20000" in final_obs.seller_conversation_history[0]["content"]
+    assert "Can you confirm the buyer profile is acceptable" in final_obs.seller_conversation_history[0]["content"]
+    assert "Saturday 11am" in final_obs.seller_conversation_history[0]["content"]
+    assert "buyer profile is acceptable" in final_obs.seller_conversation_history[1]["content"]
     assert "Saturday 11am works for the visit" in final_obs.seller_conversation_history[1]["content"]
+    contact_result = next(result for result in final_obs.tool_results if result["tool"] == "contact_poster")
+    assert contact_result["buyer_profile_shared"] is True
+    assert contact_result["seller_profile_fit_confirmed"] is True
 def test_buyer_answers_diet_and_availability_when_broker_asks_for_both() -> None:
     obs = _msg(env, "No Tuesday slot matches. I can offer Saturday 1pm or Sunday 5pm instead.")
     assert "confirm" in obs.last_user_message.lower()
     assert "Sunday 5pm" in obs.last_user_message or "Saturday 1pm" in obs.last_user_message
+    _tool(env, "contact_poster", post_id="post_023", time_text="Sunday 5pm")
+    obs = _tool(env, "book_viewing", post_id="post_023", time_text="Sunday 5pm")
+    assert obs.done is True
+    assert obs.booked_visits == [{"post_id": "post_023", "time": "Sunday 5pm"}]
 def test_multi_visit_scenario_books_two_visits() -> None:
     assert obs.status == "completed"
     assert obs.booked_visits == [{"post_id": "post_155", "time": "negotiated_deal", "agreed_rent": 21000}]
     assert obs.last_tool_result["tool"] == "confirm_negotiated_deal"
+    assert any("Would you accept Rs. 21000" in item["content"] for item in obs.seller_conversation_history)
+    assert any("I can accept Rs. 21000" in item["content"] for item in obs.seller_conversation_history)

tests/test_reward_regression.py CHANGED Viewed

@@ -9,7 +9,7 @@ from flatmate_rl.server.heuristic_policy import expected_policy_action
 HEURISTIC_BASELINES = {
     "task_visit_single": 0.70,
-    "task_visit_single_hidden_flex": -1.70,
     "task_visit_multi": 1.10,
     "task_visit_single_seller_followup": 0.90,
 }

 HEURISTIC_BASELINES = {
     "task_visit_single": 0.70,
+    "task_visit_single_hidden_flex": 0.90,
     "task_visit_multi": 1.10,
     "task_visit_single_seller_followup": 0.90,
 }

train_flatmate_rl_grpo_step2.ipynb ADDED Viewed

	@@ -0,0 +1,205 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Flatmate RL GRPO Step-2 Curriculum\n",
+        "\n",
+        "This notebook is a minimal GRPO starter for `flatmate_rl`.\n",
+        "It only trains the first two workflow steps:\n",
+        "\n",
+        "1. ask for the missing buyer details\n",
+        "2. store the buyer profile\n",
+        "\n",
+        "The goal is to keep the reward simple enough to bootstrap the broker policy before training on later booking steps."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%pip install -q trl transformers accelerate datasets peft bitsandbytes sentencepiece\n",
+        "\n",
+        "from __future__ import annotations\n",
+        "\n",
+        "import json\n",
+        "import sys\n",
+        "from pathlib import Path\n",
+        "\n",
+        "repo_root = Path.cwd().resolve().parent\n",
+        "if str(repo_root) not in sys.path:\n",
+        "    sys.path.insert(0, str(repo_root))\n",
+        "\n",
+        "from datasets import Dataset\n",
+        "from flatmate_rl import FlatmateRlAction\n",
+        "from flatmate_rl.server.flatmate_rl_environment import FlatmateRlEnvironment\n",
+        "from flatmate_rl.server.heuristic_policy import expected_policy_action\n",
+        "\n",
+        "print('imports ready')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "TARGET_SCENARIOS = [\n",
+        "    'task_visit_single',\n",
+        "    'task_visit_single_hidden_flex',\n",
+        "    'task_visit_multi',\n",
+        "    'task_visit_single_seller_followup',\n",
+        "]\n",
+        "\n",
+        "def format_prompt(obs, step: int) -> str:\n",
+        "    visible_state = {\n",
+        "        'step': step,\n",
+        "        'phase': obs.phase,\n",
+        "        'status': obs.status,\n",
+        "        'remaining_required_fields': obs.remaining_required_fields,\n",
+        "        'available_tools': obs.available_tools,\n",
+        "        'feedback_summary': obs.feedback_summary,\n",
+        "        'message': obs.message,\n",
+        "        'last_tool_result': obs.last_tool_result,\n",
+        "        'buyer_history': obs.buyer_conversation_history[-4:],\n",
+        "        'seller_history': obs.seller_conversation_history[-4:],\n",
+        "    }\n",
+        "\n",
+        "    return (\n",
+        "        'Return exactly one JSON object.\\\\n'\n",
+        "        'Schema: {\"action_type\":\"assistant_message\",\"assistant_message\":\"...\"} or '\n",
+        "        '{\"action_type\":\"tool_call\",\"tool_name\":\"...\",\"tool_arguments\":{...}}\\\\n\\\\n'\n",
+        "        f'Observation:\\n{json.dumps(visible_state, ensure_ascii=False, indent=2)}\\n'\n",
+        "        'Return JSON only.'\n",
+        "    )\n",
+        "\n",
+        "rows = []\n",
+        "for scenario_id in TARGET_SCENARIOS:\n",
+        "    env = FlatmateRlEnvironment()\n",
+        "    obs = env.reset(scenario_id=scenario_id)\n",
+        "    for step in (1, 2):\n",
+        "        payload = expected_policy_action(scenario_id, obs.model_dump())\n",
+        "        if payload is None:\n",
+        "            break\n",
+        "        rows.append(\n",
+        "            {\n",
+        "                'scenario_id': scenario_id,\n",
+        "                'step': step,\n",
+        "                'prompt': format_prompt(obs, step),\n",
+        "                'expected_action': payload,\n",
+        "            }\n",
+        "        )\n",
+        "        obs = env.step(FlatmateRlAction.model_validate(payload))\n",
+        "\n",
+        "train_ds = Dataset.from_list(rows)\n",
+        "train_ds[:2]\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def score_completion(example, completion_text: str) -> float:\n",
+        "    try:\n",
+        "        action = json.loads(completion_text)\n",
+        "    except json.JSONDecodeError:\n",
+        "        return -0.25\n",
+        "\n",
+        "    step = int(example['step'])\n",
+        "    expected = example['expected_action']\n",
+        "\n",
+        "    if step == 1:\n",
+        "        message = str(action.get('assistant_message', '')).lower()\n",
+        "        if action.get('action_type') == 'assistant_message' and 'diet' in message and 'availability' in message:\n",
+        "            return 1.0\n",
+        "        return -0.1\n",
+        "\n",
+        "    if step == 2:\n",
+        "        if action.get('action_type') == 'tool_call' and action.get('tool_name') == expected.get('tool_name'):\n",
+        "            return 1.0\n",
+        "        return -0.2\n",
+        "\n",
+        "    return 0.0\n",
+        "\n",
+        "for row in rows[:2]:\n",
+        "    print(row['scenario_id'], row['step'], score_completion(row, json.dumps(row['expected_action'])))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+        "\n",
+        "model_name = 'Qwen/Qwen2.5-0.5B-Instruct'\n",
+        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+        "model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')\n",
+        "\n",
+        "from peft import LoraConfig\n",
+        "from trl import GRPOConfig, GRPOTrainer\n",
+        "\n",
+        "grpo_args = GRPOConfig(\n",
+        "    output_dir='flatmate_grpo_step2',\n",
+        "    learning_rate=1e-5,\n",
+        "    per_device_train_batch_size=1,\n",
+        "    gradient_accumulation_steps=4,\n",
+        "    max_prompt_length=1024,\n",
+        "    max_completion_length=256,\n",
+        "    num_generations=4,\n",
+        "    logging_steps=1,\n",
+        "    save_steps=25,\n",
+        ")\n",
+        "\n",
+        "lora_config = LoraConfig(\n",
+        "    r=8,\n",
+        "    lora_alpha=16,\n",
+        "    lora_dropout=0.05,\n",
+        "    bias='none',\n",
+        "    task_type='CAUSAL_LM',\n",
+        ")\n",
+        "\n",
+        "def reward_func(prompts, completions, **kwargs):\n",
+        "    rewards = []\n",
+        "    examples = kwargs['examples']\n",
+        "    for example, completion in zip(examples, completions):\n",
+        "        rewards.append(score_completion(example, completion))\n",
+        "    return rewards\n",
+        "\n",
+        "# Starter training block.\n",
+        "# If your installed TRL version expects a slightly different GRPOTrainer signature,\n",
+        "# keep the dataset, reward, and LoRA config from above and adapt only the constructor call.\n",
+        "trainer = GRPOTrainer(\n",
+        "    model=model,\n",
+        "    tokenizer=tokenizer,\n",
+        "    args=grpo_args,\n",
+        "    train_dataset=train_ds,\n",
+        "    reward_funcs=[reward_func],\n",
+        "    peft_config=lora_config,\n",
+        ")\n",
+        "\n",
+        "# trainer.train()\n",
+        "print('GRPO trainer configured for the step-1/step-2 curriculum')\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.12"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}