Committed by rtferraz
Commit 631e559 (verified) Β· Parent: 5aa00ff

v4 notebook: fix TypeError crash, suppress warnings, update paths to CWD, add V3 task-aware system prompts

Files changed (1): notebooks/v4_instruct_grpo.ipynb (+5 -5)
notebooks/v4_instruct_grpo.ipynb CHANGED
@@ -39,7 +39,7 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": "import os\nimport json\nimport re\nimport time\nimport random\nimport gc\nfrom pathlib import Path\n\n# ── Disable Unsloth kernel recompilation ─────────────────────────────────────\nos.environ[\"UNSLOTH_COMPILE_DISABLE\"] = \"1\"\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n\n# ── Model ────────────────────────────────────────────────────────────────────\nMODEL_ID = \"Polygl0t/Tucano2-qwen-0.5B-Instruct\"\nMAX_SEQ_LENGTH = 2048 # model supports 4096, but 2048 is plenty for Instruct (no <think> overhead)\nADAPTER_DIR = Path(\"models/tucano2-0.5B-instruct-grpo-v4\")\nCHECKPOINT_DIR = ADAPTER_DIR / \"checkpoints\"\n\n# ── Data ─────────────────────────────────────────────────────────────────────\nDATA_DIR = Path(\"data/pairs\")\nTRAIN_FILE = DATA_DIR / \"train.jsonl\"\nEVAL_SPLIT = 0.10 # 10% held out for eval\n\n# ── GRPO Hyperparameters ─────────────────────────────────────────────────────\nNUM_GENERATIONS = 16 # 0.5B + short completions = VRAM allows G=16\nMAX_COMPLETION_LENGTH = 512 # Instruct: no <think> overhead. Extraction ~100, SQL ~200, insights ~300\nTEMPERATURE = 1.0 # Skywork-OR1: Ο„=1.0 for exploration\nLEARNING_RATE = 2e-6 # Dr. GRPO: 4Γ— V2's 5e-7 (clip_ratio=0 β†’ push harder)\nBETA = 0.0 # Dr. GRPO Β§3.2: Ξ²=0 optimal for rule-based rewards\nSCALE_REWARDS = False # Dr. GRPO: remove std normalization bias\nBATCH_SIZE = 2 # per-device batch size\nGRAD_ACCUM = 1 # effective batch = 2 * 1 = 2 prompts * 16 gen = 32 completions\nMAX_STEPS = 200 # validation run\nSAVE_STEPS = 20\nEVAL_STEPS = 10\nEARLY_STOPPING_PATIENCE = 15\nEARLY_STOPPING_DELTA = 0.005\n\n# ── LoRA ─────────────────────────────────────────────────────────────────────\nLORA_R = 16\nLORA_ALPHA = 32\n\n# ── Monitoring ───────────────────────────────────────────────────────────────\nWANDB_PROJECT = \"tucano2-commerce\"\nEVAL_MAX_SAMPLES = 15 # eval callback samples\nEVAL_MAX_TOKENS = 512 # match training completion length\n\n# ── Task Classification (inherited from V2/V3) ──────────────────────────────\nVALID_SENTIMENTS = {\"positive\", \"negative\", \"neutral\"}\nVALID_CATEGORIES = {\n \"delivery_delay\", \"product_quality\", \"product_not_received\",\n \"wrong_product\", \"seller_communication\", \"app_issue\",\n \"price_value\", \"other\", \"none\",\n}\nVALID_CHURN = {\"low\", \"medium\", \"high\"}\nVALID_REPEAT = {\"yes\", \"no\", \"maybe\"}\nEXTRACTION_FIELDS = [\n \"sentiment\", \"sentiment_score\", \"churn_risk\", \"delivery_issue\",\n \"product_issue\", \"seller_issue\", \"main_complaint\",\n \"complaint_category\", \"repeat_intent\", \"would_recommend\",\n]\n\n# ── Verified Special Token IDs (from tokenizer_config.json) ─────────────────\n# These are constants β€” do NOT recompute via tokenizer.encode()\nTOKEN_ID_BOS = 1 # <|im_start|>\nTOKEN_ID_EOS = 2 # <|im_end|>\nTOKEN_ID_PAD = 49109 # <|pad|>\nTOKEN_ID_THINK = 49116 # <think>\nTOKEN_ID_THINK_END = 49117 # </think>\n\nprint(\"βœ“ Config loaded\")\nprint(f\" Model: {MODEL_ID}\")\nprint(f\" G={NUM_GENERATIONS}, max_comp={MAX_COMPLETION_LENGTH}, temp={TEMPERATURE}\")\nprint(f\" LR={LEARNING_RATE}, Ξ²={BETA}, scale_rewards={SCALE_REWARDS}\")\nprint(f\" LoRA r={LORA_R}, Ξ±={LORA_ALPHA}\")\nprint(f\" Max steps: {MAX_STEPS}\")"
+ "source": "import os\nimport json\nimport re\nimport time\nimport random\nimport gc\nimport warnings\nfrom pathlib import Path\n\n# ── Suppress noisy deprecation warnings from Transformers 5.5.0 ──────────────\nwarnings.filterwarnings(\"ignore\", message=\".*AttentionMaskConverter.*\")\nwarnings.filterwarnings(\"ignore\", message=\".*Passing `generation_config` together with.*\")\nwarnings.filterwarnings(\"ignore\", message=\".*max_new_tokens.*max_length.*\")\nwarnings.filterwarnings(\"ignore\", category=FutureWarning)\n\n# ── Disable Unsloth kernel recompilation ─────────────────────────────────────\nos.environ[\"UNSLOTH_COMPILE_DISABLE\"] = \"1\"\nos.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n\n# ── Model ────────────────────────────────────────────────────────────────────\nMODEL_ID = \"Polygl0t/Tucano2-qwen-0.5B-Instruct\"\nMAX_SEQ_LENGTH = 2048 # model supports 4096, but 2048 is plenty for Instruct (no <think> overhead)\nMODELS_DIR = Path(\"/home/jupyter/tucano2/models\")\nADAPTER_DIR = MODELS_DIR / \"tucano2-0.5B-instruct-grpo-v4\"\nCHECKPOINT_DIR = ADAPTER_DIR / \"checkpoints\"\n\n# ── Data ─────────────────────────────────────────────────────────────────────\nDATA_DIR = Path(\"/home/jupyter/tucano2/data\")\nTRAIN_FILE = DATA_DIR / \"pairs\" / \"train.jsonl\"\nEVAL_SPLIT = 0.10 # 10% held out for eval\n\n# ── GRPO Hyperparameters ─────────────────────────────────────────────────────\nNUM_GENERATIONS = 16 # 0.5B + short completions = VRAM allows G=16\nMAX_COMPLETION_LENGTH = 512 # Instruct: no <think> overhead. Extraction ~100, SQL ~200, insights ~300\nTEMPERATURE = 1.0 # Skywork-OR1: Ο„=1.0 for exploration\nLEARNING_RATE = 2e-6 # Dr. GRPO: 4Γ— V2's 5e-7 (clip_ratio=0 β†’ push harder)\nBETA = 0.0 # Dr. GRPO Β§3.2: Ξ²=0 optimal for rule-based rewards\nSCALE_REWARDS = False # Dr. GRPO: remove std normalization bias\nBATCH_SIZE = 2 # per-device batch size\nGRAD_ACCUM = 1 # effective batch = 2 * 1 = 2 prompts * 16 gen = 32 completions\nMAX_STEPS = 200 # validation run\nSAVE_STEPS = 20\nEVAL_STEPS = 10\nEARLY_STOPPING_PATIENCE = 15\nEARLY_STOPPING_DELTA = 0.005\n\n# ── LoRA ─────────────────────────────────────────────────────────────────────\nLORA_R = 16\nLORA_ALPHA = 32\n\n# ── Monitoring ───────────────────────────────────────────────────────────────\nWANDB_PROJECT = \"tucano2-commerce\"\nEVAL_MAX_SAMPLES = 15 # eval callback samples\nEVAL_MAX_TOKENS = 512 # match training completion length\n\n# ── Task Classification (inherited from V2/V3) ──────────────────────────────\nVALID_SENTIMENTS = {\"positive\", \"negative\", \"neutral\"}\nVALID_CATEGORIES = {\n \"delivery_delay\", \"product_quality\", \"product_not_received\",\n \"wrong_product\", \"seller_communication\", \"app_issue\",\n \"price_value\", \"other\", \"none\",\n}\nVALID_CHURN = {\"low\", \"medium\", \"high\"}\nVALID_REPEAT = {\"yes\", \"no\", \"maybe\"}\nEXTRACTION_FIELDS = [\n \"sentiment\", \"sentiment_score\", \"churn_risk\", \"delivery_issue\",\n \"product_issue\", \"seller_issue\", \"main_complaint\",\n \"complaint_category\", \"repeat_intent\", \"would_recommend\",\n]\n\n# ── Verified Special Token IDs (from tokenizer_config.json) ─────────────────\n# These are constants β€” do NOT recompute via tokenizer.encode()\nTOKEN_ID_BOS = 1 # <|im_start|>\nTOKEN_ID_EOS = 2 # <|im_end|>\nTOKEN_ID_PAD = 49109 # <|pad|>\nTOKEN_ID_THINK = 49116 # <think>\nTOKEN_ID_THINK_END = 49117 # </think>\n\n# ══════════════════════════════════════════════════════════════════════════════\n# TASK-AWARE SYSTEM PROMPTS (inherited from V3)\n# Research basis:\n# - OptimalThinkingBench (2508.13141): task-specific instructions improve accuracy\n# - V3 calibration: extraction prompt \"Retorne APENAS um objeto JSON\" teaches format\n# - Cell 7 evidence: without this, model adds explanation text after JSON\n# ══════════════════════════════════════════════════════════════════════════════\n\nSYSTEM_EXTRACTION = (\n \"VocΓͺ Γ© um motor de extraΓ§Γ£o de dados de e-commerce brasileiro. \"\n \"Retorne APENAS um objeto JSON vΓ‘lido, sem nenhum texto antes ou depois. \"\n \"NΓƒO USE blocos de cΓ³digo markdown (```json). \"\n \"O primeiro caractere da sua resposta deve ser { e o ΓΊltimo deve ser }. \"\n \"Campos nΓ£o mencionados na avaliaΓ§Γ£o devem ser null β€” nunca invente valores. \"\n \"Sem explicaΓ§Γ£o. Sem comentΓ‘rios.\"\n)\n\nSYSTEM_SQL = (\n \"VocΓͺ Γ© um assistente de IA especializado em anΓ‘lise de e-commerce brasileiro. \"\n \"VocΓͺ compreende avaliaΓ§Γ΅es de clientes em portuguΓͺs e padrΓ΅es de comΓ©rcio brasileiro.\\n\\n\"\n \"Para consultas e anΓ‘lises de dados: apresente a resposta de forma direta \"\n \"com nΓΊmeros e dados concretos. Seja conciso.\"\n)\n\nSYSTEM_INSIGHTS = (\n \"VocΓͺ Γ© um assistente de IA especializado em anΓ‘lise de e-commerce brasileiro. \"\n \"VocΓͺ compreende avaliaΓ§Γ΅es de clientes em portuguΓͺs e padrΓ΅es de comΓ©rcio brasileiro.\\n\\n\"\n \"Para anΓ‘lises estratΓ©gicas: raciocine de forma estruturada e concisa, \"\n \"focando nos pontos principais e recomendaΓ§Γ΅es acionΓ‘veis.\"\n)\n\nSYSTEM_PUSH = (\n \"VocΓͺ Γ© um assistente de IA especializado em anΓ‘lise de e-commerce brasileiro. \"\n \"VocΓͺ compreende avaliaΓ§Γ΅es de clientes em portuguΓͺs e padrΓ΅es de comΓ©rcio brasileiro.\\n\\n\"\n \"Para notificaΓ§Γ΅es push: seja direto e criativo. \"\n \"A notificaΓ§Γ£o deve ter no mΓ‘ximo 120 caracteres. \"\n \"Responda diretamente.\"\n)\n\nSYSTEM_PT = (\n \"VocΓͺ Γ© um assistente de IA especializado em anΓ‘lise de e-commerce brasileiro. \"\n \"VocΓͺ compreende avaliaΓ§Γ΅es de clientes em portuguΓͺs e padrΓ΅es de comΓ©rcio brasileiro.\"\n)\n\ndef get_system_prompt(task_type: str) -> str:\n return {\n \"extraction\": SYSTEM_EXTRACTION,\n \"sql_qa\": SYSTEM_SQL,\n \"insights\": SYSTEM_INSIGHTS,\n \"push\": SYSTEM_PUSH,\n }.get(task_type, SYSTEM_PT)\n\ndef inject_task_system_prompt(msgs, task_type):\n \"\"\"Replace generic system prompt with task-specific one.\"\"\"\n new_msgs = []\n system_prompt = get_system_prompt(task_type)\n has_system = False\n for m in msgs:\n if m[\"role\"] == \"system\":\n new_msgs.append({\"role\": \"system\", \"content\": system_prompt})\n has_system = True\n else:\n new_msgs.append(m)\n if not has_system:\n new_msgs.insert(0, {\"role\": \"system\", \"content\": system_prompt})\n return new_msgs\n\nprint(\"βœ“ Task-aware system prompts defined\")\n\nprint(\"βœ“ Config loaded\")\nprint(f\" Model: {MODEL_ID}\")\nprint(f\" G={NUM_GENERATIONS}, max_comp={MAX_COMPLETION_LENGTH}, temp={TEMPERATURE}\")\nprint(f\" LR={LEARNING_RATE}, Ξ²={BETA}, scale_rewards={SCALE_REWARDS}\")\nprint(f\" LoRA r={LORA_R}, Ξ±={LORA_ALPHA}\")\nprint(f\" Max steps: {MAX_STEPS}\")"
  },
  {
  "cell_type": "markdown",
@@ -87,7 +87,7 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": "FastLanguageModel.for_inference(model)\n\ntest_msgs = [\n {\"role\": \"system\", \"content\": \"VocΓͺ Γ© um assistente de IA especializado em e-commerce brasileiro.\"},\n {\"role\": \"user\", \"content\": \"Analise esta avaliaΓ§Γ£o: 'Produto chegou quebrado, pΓ©ssima embalagem. Nunca mais compro aqui.' Retorne um objeto JSON com os campos: sentiment, sentiment_score, delivery_issue, complaint_category.\"},\n]\ntext = tokenizer.apply_chat_template(test_msgs, tokenize=False, add_generation_prompt=True)\ninputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n\nt0 = time.time()\noutputs = model.generate(\n **inputs,\n max_new_tokens=256,\n temperature=0.1, # low temp for deterministic eval\n do_sample=True,\n repetition_penalty=1.0,\n)\nelapsed = time.time() - t0\n\nresponse = tokenizer.decode(outputs[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\nprint(f\"Generation time: {elapsed:.1f}s\")\nprint(f\"Response length: {len(response)} chars\")\nprint(f\"Contains <think>: {'<think>' in response}\")\nprint(f\"Contains JSON {{ }}: {'{' in response and '}' in response}\")\nprint(f\"\\n{'='*60}\")\nprint(response[:500])"
+ "source": "FastLanguageModel.for_inference(model)\n\ntest_msgs = [\n {\"role\": \"system\", \"content\": SYSTEM_EXTRACTION},\n {\"role\": \"user\", \"content\": \"Analise esta avaliaΓ§Γ£o: 'Produto chegou quebrado, pΓ©ssima embalagem. Nunca mais compro aqui.' Retorne um objeto JSON com os campos: sentiment, sentiment_score, delivery_issue, complaint_category.\"},\n]\ntext = tokenizer.apply_chat_template(test_msgs, tokenize=False, add_generation_prompt=True)\ninputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n\nt0 = time.time()\noutputs = model.generate(\n **inputs,\n max_new_tokens=256,\n temperature=0.1, # low temp for deterministic eval\n do_sample=True,\n repetition_penalty=1.0,\n)\nelapsed = time.time() - t0\n\nresponse = tokenizer.decode(outputs[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\nprint(f\"Generation time: {elapsed:.1f}s\")\nprint(f\"Response length: {len(response)} chars\")\nprint(f\"Contains <think>: {'<think>' in response}\")\nprint(f\"Contains JSON {{ }}: {'{' in response and '}' in response}\")\nprint(f\"\\n{'='*60}\")\nprint(response[:500])"
  },
  {
  "cell_type": "markdown",
@@ -99,7 +99,7 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": "def strip_think(text: str) -> str:\n \"\"\"Remove <think>...</think> block, return the answer portion.\"\"\"\n return re.sub(r\"<think>.*?</think>\", \"\", text, flags=re.DOTALL).strip()\n\n\ndef has_think_block(text: str) -> bool:\n return bool(re.search(r\"<think>.+</think>\", text, flags=re.DOTALL))\n\n\ndef _classify_task_type(prompt_text: str) -> str:\n p = prompt_text.lower()\n if \"retorne um objeto json\" in p or \"extraia dados\" in p or \"json\" in p:\n return \"extraction\"\n elif \"notificaΓ§Γ£o push\" in p or \"notificaΓ§Γ£o de reengajamento\" in p:\n return \"push\"\n elif \"perfil do cliente\" in p or \"retenΓ§Γ£o\" in p or \"anΓ‘lise\" in p or \"insight\" in p:\n return \"insights\"\n else:\n return \"sql_qa\"\n\n\ndef _extract_json(text: str) -> dict | None:\n \"\"\"Extract first JSON object from text. Returns parsed dict or None.\"\"\"\n stripped = text.strip()\n stripped = re.sub(r\"^```(?:json)?\\s*\", \"\", stripped)\n stripped = re.sub(r\"\\s*```$\", \"\", stripped)\n stripped = stripped.strip()\n try:\n return json.loads(stripped)\n except (json.JSONDecodeError, TypeError):\n pass\n match = re.search(r\"\\{[^{}]*(?:\\{[^{}]*\\}[^{}]*)*\\}\", text, re.DOTALL)\n if match:\n try:\n return json.loads(match.group())\n except (json.JSONDecodeError, TypeError):\n pass\n return None\n\n\ndef reward_extraction(completion: str) -> float:\n \"\"\"Continuous reward for extraction tasks (max 1.0).\"\"\"\n answer = strip_think(completion)\n data = _extract_json(answer)\n\n if data is None:\n if \"{\" in answer and \"}\" in answer:\n return 0.05\n return 0.0\n\n if not isinstance(data, dict):\n return 0.1 # valid JSON but not an object\n\n score = 0.3 # valid JSON object\n\n # Schema completeness (0.3 total)\n present = sum(1 for f in EXTRACTION_FIELDS if f in data)\n score += 0.3 * (present / len(EXTRACTION_FIELDS))\n\n # Value validity (0.4 total, split across checks)\n checks_passed = 0\n checks_total = 0\n\n for field, validator in [\n (\"sentiment\", lambda v: v in VALID_SENTIMENTS),\n (\"complaint_category\", lambda v: v in VALID_CATEGORIES),\n (\"churn_risk\", lambda v: v in VALID_CHURN),\n (\"repeat_intent\", lambda v: v in VALID_REPEAT),\n (\"sentiment_score\", lambda v: isinstance(v, (int, float)) and 1 <= v <= 5),\n ]:\n checks_total += 1\n if field in data and validator(data[field]):\n checks_passed += 1\n\n for bool_field in (\"delivery_issue\", \"product_issue\", \"seller_issue\", \"would_recommend\"):\n checks_total += 1\n if bool_field in data and isinstance(data[bool_field], bool):\n checks_passed += 1\n\n if checks_total > 0:\n score += 0.4 * (checks_passed / checks_total)\n\n return min(score, 1.0)\n\n\ndef reward_sql_qa(completion: str) -> float:\n \"\"\"Continuous reward for SQL Q&A (max 1.0).\"\"\"\n answer = strip_think(completion)\n if not answer.strip():\n return 0.0\n\n score = 0.0\n\n # Numerical content (more numbers = more specific answer)\n numbers = re.findall(r\"\\d+(?:[.,]\\d+)?\", answer)\n score += min(0.4, 0.1 * len(numbers))\n\n # Length: 50-500 chars optimal\n length = len(answer)\n if 50 <= length <= 500:\n score += 0.3\n elif length > 0:\n score += 0.3 * max(0, 1 - abs(length - 275) / 275)\n\n # Portuguese business vocabulary\n pt_business = [\"pedidos\", \"clientes\", \"mΓ©dia\", \"total\", \"taxa\", \"vendas\",\n \"produtos\", \"perΓ­odo\", \"categoria\", \"regiΓ£o\", \"faturamento\"]\n pt_matches = sum(1 for w in pt_business if w in answer.lower())\n score += min(0.3, 0.06 * pt_matches)\n\n return min(score, 1.0)\n\n\ndef 
reward_insights(completion: str) -> float:\n \"\"\"Continuous reward for insights (max 1.0).\"\"\"\n answer = strip_think(completion)\n if not answer.strip():\n return 0.0\n\n score = 0.0\n\n # Actionable language\n action_words = [\"recomend\", \"implement\", \"melhor\", \"reduzir\", \"aumentar\",\n \"priorizar\", \"investir\", \"otimizar\", \"estratΓ©gi\", \"aΓ§Γ£o\"]\n matches = sum(1 for w in action_words if w in answer.lower())\n score += min(0.4, 0.08 * matches)\n\n # Length: 100-800 chars optimal\n length = len(answer)\n if 100 <= length <= 800:\n score += 0.3\n elif length > 0:\n score += 0.3 * max(0, 1 - abs(length - 450) / 450)\n\n # Structure: bullet points, numbered lists, headers\n structure_marks = len(re.findall(r\"^[-β€’*]\\s|^\\d+[.)]\\s|^#{1,3}\\s\", answer, re.MULTILINE))\n score += min(0.2, 0.04 * structure_marks)\n\n # Portuguese coherence marker\n if any(w in answer.lower() for w in [\"cliente\", \"produto\", \"serviΓ§o\", \"empresa\"]):\n score += 0.1\n\n return min(score, 1.0)\n\n\ndef reward_push(completion: str) -> float:\n \"\"\"Continuous reward for push notifications (max 1.0).\"\"\"\n answer = strip_think(completion).strip()\n if not answer:\n return 0.0\n\n # Length: ≀120 chars gets full credit\n length = len(answer)\n if length <= 120:\n length_score = 0.5\n else:\n length_score = 0.5 * max(0, 1 - (length - 120) / 120)\n\n # Portuguese content\n pt_markers = re.findall(r\"[ãçéΓͺΓ³ΓΊΓ’Γ΅]|vocΓͺ|para|como|seu|sua|oferta|desconto|produto\",\n answer, re.IGNORECASE)\n lang_score = min(0.3, 0.03 * len(pt_markers))\n\n # Non-generic (penalize very generic phrases)\n generic = [\"olΓ‘\", \"obrigado pela compra\", \"agradecemos\"]\n is_generic = any(g in answer.lower() for g in generic)\n creativity_score = 0.0 if is_generic else 0.2\n\n return min(length_score + lang_score + creativity_score, 1.0)\n\n\ndef commerce_reward_fn(completions, prompts, **kwargs) -> list[float]:\n \"\"\"Master reward function: dispatches by task type.\"\"\"\n rewards = []\n for completion, prompt in zip(completions, prompts):\n if isinstance(completion, list):\n comp_text = completion[-1][\"content\"] if completion else \"\"\n else:\n comp_text = str(completion)\n\n if isinstance(prompt, list):\n prompt_text = \" \".join(m.get(\"content\", \"\") for m in prompt)\n else:\n prompt_text = str(prompt)\n\n task = _classify_task_type(prompt_text)\n\n if task == \"extraction\":\n rewards.append(reward_extraction(comp_text))\n elif task == \"sql_qa\":\n rewards.append(reward_sql_qa(comp_text))\n elif task == \"insights\":\n rewards.append(reward_insights(comp_text))\n elif task == \"push\":\n rewards.append(reward_push(comp_text))\n else:\n r = 0.2 if comp_text.strip() else 0.0\n rewards.append(r)\n\n return rewards\n\n\nprint(\"βœ“ Reward functions defined\")"
+ "source": "def strip_think(text: str) -> str:\n \"\"\"Remove <think>...</think> block, return the answer portion.\"\"\"\n return re.sub(r\"<think>.*?</think>\", \"\", text, flags=re.DOTALL).strip()\n\n\ndef has_think_block(text: str) -> bool:\n return bool(re.search(r\"<think>.+</think>\", text, flags=re.DOTALL))\n\n\ndef _classify_task_type(prompt_text: str) -> str:\n p = prompt_text.lower()\n if \"retorne um objeto json\" in p or \"extraia dados\" in p or \"json\" in p:\n return \"extraction\"\n elif \"notificaΓ§Γ£o push\" in p or \"notificaΓ§Γ£o de reengajamento\" in p:\n return \"push\"\n elif \"perfil do cliente\" in p or \"retenΓ§Γ£o\" in p or \"anΓ‘lise\" in p or \"insight\" in p:\n return \"insights\"\n else:\n return \"sql_qa\"\n\n\ndef _extract_json(text: str) -> dict | None:\n \"\"\"Extract first JSON object from text. Returns parsed dict or None.\"\"\"\n stripped = text.strip()\n stripped = re.sub(r\"^```(?:json)?\\s*\", \"\", stripped)\n stripped = re.sub(r\"\\s*```$\", \"\", stripped)\n stripped = stripped.strip()\n try:\n return json.loads(stripped)\n except (json.JSONDecodeError, TypeError):\n pass\n match = re.search(r\"\\{[^{}]*(?:\\{[^{}]*\\}[^{}]*)*\\}\", text, re.DOTALL)\n if match:\n try:\n return json.loads(match.group())\n except (json.JSONDecodeError, TypeError):\n pass\n return None\n\n\ndef reward_extraction(completion: str) -> float:\n \"\"\"Continuous reward for extraction tasks (max 1.0).\"\"\"\n answer = strip_think(completion)\n data = _extract_json(answer)\n\n if data is None:\n if \"{\" in answer and \"}\" in answer:\n return 0.05\n return 0.0\n\n if not isinstance(data, dict):\n return 0.1 # valid JSON but not an object\n\n score = 0.3 # valid JSON object\n\n # Schema completeness (0.3 total)\n present = sum(1 for f in EXTRACTION_FIELDS if f in data)\n score += 0.3 * (present / len(EXTRACTION_FIELDS))\n\n # Value validity (0.4 total, split across checks)\n checks_passed = 0\n checks_total = 0\n\n for field, validator in [\n (\"sentiment\", lambda v: isinstance(v, str) and v in VALID_SENTIMENTS),\n (\"complaint_category\", lambda v: isinstance(v, str) and v in VALID_CATEGORIES),\n (\"churn_risk\", lambda v: isinstance(v, str) and v in VALID_CHURN),\n (\"repeat_intent\", lambda v: isinstance(v, str) and v in VALID_REPEAT),\n (\"sentiment_score\", lambda v: isinstance(v, (int, float)) and 1 <= v <= 5),\n ]:\n checks_total += 1\n if field in data and validator(data[field]):\n checks_passed += 1\n\n for bool_field in (\"delivery_issue\", \"product_issue\", \"seller_issue\", \"would_recommend\"):\n checks_total += 1\n if bool_field in data and isinstance(data[bool_field], bool):\n checks_passed += 1\n\n if checks_total > 0:\n score += 0.4 * (checks_passed / checks_total)\n\n return min(score, 1.0)\n\n\ndef reward_sql_qa(completion: str) -> float:\n \"\"\"Continuous reward for SQL Q&A (max 1.0).\"\"\"\n answer = strip_think(completion)\n if not answer.strip():\n return 0.0\n\n score = 0.0\n\n # Numerical content (more numbers = more specific answer)\n numbers = re.findall(r\"\\d+(?:[.,]\\d+)?\", answer)\n score += min(0.4, 0.1 * len(numbers))\n\n # Length: 50-500 chars optimal\n length = len(answer)\n if 50 <= length <= 500:\n score += 0.3\n elif length > 0:\n score += 0.3 * max(0, 1 - abs(length - 275) / 275)\n\n # Portuguese business vocabulary\n pt_business = [\"pedidos\", \"clientes\", \"mΓ©dia\", \"total\", \"taxa\", \"vendas\",\n \"produtos\", \"perΓ­odo\", \"categoria\", \"regiΓ£o\", \"faturamento\"]\n pt_matches = sum(1 for w in pt_business if w in answer.lower())\n score += min(0.3, 0.06 * pt_matches)\n\n return min(score, 1.0)\n\n\ndef reward_insights(completion: str) -> float:\n \"\"\"Continuous reward for insights (max 1.0).\"\"\"\n answer = strip_think(completion)\n if not answer.strip():\n return 0.0\n\n score = 0.0\n\n # Actionable language\n action_words = [\"recomend\", \"implement\", \"melhor\", \"reduzir\", \"aumentar\",\n \"priorizar\", \"investir\", \"otimizar\", \"estratΓ©gi\", \"aΓ§Γ£o\"]\n matches = sum(1 for w in action_words if w in answer.lower())\n score += min(0.4, 0.08 * matches)\n\n # Length: 100-800 chars optimal\n length = len(answer)\n if 100 <= length <= 800:\n score += 0.3\n elif length > 0:\n score += 0.3 * max(0, 1 - abs(length - 450) / 450)\n\n # Structure: bullet points, numbered lists, headers\n structure_marks = len(re.findall(r\"^[-β€’*]\\s|^\\d+[.)]\\s|^#{1,3}\\s\", answer, re.MULTILINE))\n score += min(0.2, 0.04 * structure_marks)\n\n # Portuguese coherence marker\n if any(w in answer.lower() for w in [\"cliente\", \"produto\", \"serviΓ§o\", \"empresa\"]):\n score += 0.1\n\n return min(score, 1.0)\n\n\ndef reward_push(completion: str) -> float:\n \"\"\"Continuous reward for push notifications (max 1.0).\"\"\"\n answer = strip_think(completion).strip()\n if not answer:\n return 0.0\n\n # Length: ≀120 chars gets full credit\n length = len(answer)\n if length <= 120:\n length_score = 0.5\n else:\n length_score = 0.5 * max(0, 1 - (length - 120) / 120)\n\n # Portuguese content\n pt_markers = re.findall(r\"[ãçéΓͺΓ³ΓΊΓ’Γ΅]|vocΓͺ|para|como|seu|sua|oferta|desconto|produto\",\n answer, re.IGNORECASE)\n lang_score = min(0.3, 0.03 * len(pt_markers))\n\n # Non-generic (penalize very generic phrases)\n generic = [\"olΓ‘\", \"obrigado pela compra\", \"agradecemos\"]\n is_generic = any(g in answer.lower() for g in generic)\n creativity_score = 0.0 if is_generic else 0.2\n\n return min(length_score + lang_score + creativity_score, 1.0)\n\n\ndef commerce_reward_fn(completions, prompts, **kwargs) -> list[float]:\n \"\"\"Master reward function: dispatches by task type.\"\"\"\n rewards = []\n for completion, prompt in zip(completions, prompts):\n if isinstance(completion, list):\n comp_text = completion[-1][\"content\"] if completion else \"\"\n else:\n comp_text = str(completion)\n\n if isinstance(prompt, list):\n prompt_text = \" \".join(m.get(\"content\", \"\") for m in prompt)\n else:\n prompt_text = str(prompt)\n\n task = _classify_task_type(prompt_text)\n\n if task == \"extraction\":\n rewards.append(reward_extraction(comp_text))\n elif task == \"sql_qa\":\n rewards.append(reward_sql_qa(comp_text))\n elif task == \"insights\":\n rewards.append(reward_insights(comp_text))\n elif task == \"push\":\n rewards.append(reward_push(comp_text))\n else:\n r = 0.2 if comp_text.strip() else 0.0\n rewards.append(r)\n\n return rewards\n\n\nprint(\"βœ“ Reward functions defined\")"
  },
  {
  "cell_type": "markdown",
@@ -111,7 +111,7 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": "# Load data, classify by task type, run calibration on 8 diverse samples\n\nby_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n by_type[task].append(prompt_msgs)\n\nprint(f\"Prompts by type: {', '.join(f'{k}={len(v)}' for k, v in by_type.items())}\")\n\n# Pick 2 samples per task type = 8 total\nrng = random.Random(42)\ncal_samples = []\nfor task_type in by_type:\n pool = by_type[task_type]\n if len(pool) >= 2:\n cal_samples.extend(rng.sample(pool, 2))\n elif pool:\n cal_samples.extend(pool)\n\nFastLanguageModel.for_inference(model)\nprint(f\"\\nReward calibration ({len(cal_samples)} samples):\")\nprint(\"-\" * 60)\n\ncal_rewards = []\nfor i, msgs in enumerate(cal_samples):\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n outputs = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.7,\n do_sample=True,\n repetition_penalty=1.0,\n )\n response = tokenizer.decode(outputs[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn([response], [text])[0]\n cal_rewards.append(r)\n task = _classify_task_type(\" \".join(m.get(\"content\", \"\") for m in msgs if m[\"role\"] == \"user\"))\n has_think = \"<think>\" in response\n answer_preview = strip_think(response)[:100]\n print(f\" Sample {i+1} [{task:12s}]: reward={r:.2f} | has_think={has_think} | {answer_preview}\")\n\nprint(f\"\\nMean={sum(cal_rewards)/len(cal_rewards):.2f}, Min={min(cal_rewards):.2f}, Max={max(cal_rewards):.2f}\")\nprint(f\"Reward variance > 0: {len(set(f'{r:.4f}' for r in cal_rewards)) > 1}\")"
+ "source": "# Load data, classify by task type, run calibration on 8 diverse samples\n\nby_type = {\"extraction\": [], \"sql_qa\": [], \"insights\": [], \"push\": []}\nwith open(TRAIN_FILE) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if not prompt_msgs:\n continue\n user_text = \" \".join(m[\"content\"] for m in prompt_msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n by_type[task].append(prompt_msgs)\n\nprint(f\"Prompts by type: {', '.join(f'{k}={len(v)}' for k, v in by_type.items())}\")\n\n# Pick 2 samples per task type = 8 total\nrng = random.Random(42)\ncal_samples = []\nfor task_type in by_type:\n pool = by_type[task_type]\n if len(pool) >= 2:\n cal_samples.extend(rng.sample(pool, 2))\n elif pool:\n cal_samples.extend(pool)\n\nFastLanguageModel.for_inference(model)\nprint(f\"\\nReward calibration ({len(cal_samples)} samples):\")\nprint(\"-\" * 60)\n\ncal_rewards = []\nfor i, msgs in enumerate(cal_samples):\n # Inject task-aware system prompt\n user_text_cal = \" \".join(m.get(\"content\", \"\") for m in msgs if m[\"role\"] == \"user\")\n task_cal = _classify_task_type(user_text_cal)\n msgs = inject_task_system_prompt(msgs, task_cal)\n text = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)\n inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n outputs = model.generate(\n **inputs,\n max_new_tokens=MAX_COMPLETION_LENGTH,\n temperature=0.7,\n do_sample=True,\n repetition_penalty=1.0,\n )\n response = tokenizer.decode(outputs[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n r = commerce_reward_fn([response], [text])[0]\n cal_rewards.append(r)\n task = _classify_task_type(\" \".join(m.get(\"content\", \"\") for m in msgs if m[\"role\"] == \"user\"))\n has_think = \"<think>\" in response\n answer_preview = strip_think(response)[:100]\n print(f\" Sample {i+1} [{task:12s}]: reward={r:.2f} | has_think={has_think} | {answer_preview}\")\n\nprint(f\"\\nMean={sum(cal_rewards)/len(cal_rewards):.2f}, Min={min(cal_rewards):.2f}, Max={max(cal_rewards):.2f}\")\nprint(f\"Reward variance > 0: {len(set(f'{r:.4f}' for r in cal_rewards)) > 1}\")"
  },
  {
  "cell_type": "markdown",
@@ -123,7 +123,7 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": "from datasets import Dataset\n\ndef prepare_datasets(train_file, eval_ratio=EVAL_SPLIT, seed=42):\n rng = random.Random(seed)\n\n all_records = []\n with open(train_file) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if prompt_msgs:\n all_records.append(prompt_msgs)\n\n rng.shuffle(all_records)\n n_eval = max(1, int(len(all_records) * eval_ratio))\n eval_records = all_records[:n_eval]\n train_records = all_records[n_eval:]\n\n # Log task distribution\n for label, records in [(\"train\", train_records), (\"eval\", eval_records)]:\n dist = {}\n for msgs in records:\n user_text = \" \".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n dist[task] = dist.get(task, 0) + 1\n print(f\" {label}: {len(records)} prompts β€” {dist}\")\n\n train_ds = Dataset.from_list([{\"prompt\": msgs} for msgs in train_records])\n eval_ds = Dataset.from_list([{\"prompt\": msgs} for msgs in eval_records])\n return train_ds, eval_ds\n\ntrain_dataset, eval_dataset = prepare_datasets(TRAIN_FILE)\nprint(f\"\\nβœ“ Datasets: train={len(train_dataset)}, eval={len(eval_dataset)}\")"
+ "source": "from datasets import Dataset\n\ndef prepare_datasets(train_file, eval_ratio=EVAL_SPLIT, seed=42):\n rng = random.Random(seed)\n\n all_records = []\n with open(train_file) as f:\n for line in f:\n row = json.loads(line)\n convs = row[\"conversations\"]\n prompt_msgs = [m for m in convs if m[\"role\"] in (\"system\", \"user\")]\n if prompt_msgs:\n all_records.append(prompt_msgs)\n\n rng.shuffle(all_records)\n n_eval = max(1, int(len(all_records) * eval_ratio))\n eval_records = all_records[:n_eval]\n train_records = all_records[n_eval:]\n\n # ── Inject task-aware system prompts (V3 prompt engineering) ─────────────\n for i, msgs in enumerate(train_records):\n user_text = \" \".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n train_records[i] = inject_task_system_prompt(msgs, task)\n for i, msgs in enumerate(eval_records):\n user_text = \" \".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n eval_records[i] = inject_task_system_prompt(msgs, task)\n print(\" βœ“ Task-aware system prompts injected\")\n\n # Log task distribution\n for label, records in [(\"train\", train_records), (\"eval\", eval_records)]:\n dist = {}\n for msgs in records:\n user_text = \" \".join(m[\"content\"] for m in msgs if m[\"role\"] == \"user\")\n task = _classify_task_type(user_text)\n dist[task] = dist.get(task, 0) + 1\n print(f\" {label}: {len(records)} prompts β€” {dist}\")\n\n train_ds = Dataset.from_list([{\"prompt\": msgs} for msgs in train_records])\n eval_ds = Dataset.from_list([{\"prompt\": msgs} for msgs in eval_records])\n return train_ds, eval_ds\n\ntrain_dataset, eval_dataset = prepare_datasets(TRAIN_FILE)\nprint(f\"\\nβœ“ Datasets: train={len(train_dataset)}, eval={len(eval_dataset)}\")"
  },
  {
  "cell_type": "markdown",