Add 03_ecommerce_finetune.ipynb — next-purchase prediction with JointFusion, LightGBM baseline comparison

Browse files

Files changed (1) hide show

notebooks/03_ecommerce_finetune.ipynb +468 -0

notebooks/03_ecommerce_finetune.ipynb ADDED Viewed

	@@ -0,0 +1,468 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 03 — E-Commerce Fine-Tuning: Next-Purchase Prediction\n",
+    "\n",
+    "**Goal:** Fine-tune the pre-trained DomainTransformer for predicting whether a user will make a purchase, and compare against a LightGBM baseline on hand-crafted features.\n",
+    "\n",
+    "**Task:** Binary classification — given a user's event sequence, predict if they will purchase (1) or not (0).\n",
+    "\n",
+    "**Pre-trained model:** [rtferraz/ecommerce-domain-24m](https://huggingface.co/rtferraz/ecommerce-domain-24m)\n",
+    "\n",
+    "**Architecture:** JointFusionModel (pre-trained Transformer + DCNv2 with PLR tabular embeddings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install datasets transformers torch accelerate tokenizers numpy pandas matplotlib scikit-learn wandb huggingface_hub lightgbm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging, pickle, os, sys, gc\n",
+    "from datetime import datetime\n",
+    "from collections import Counter\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import torch\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score, classification_report\n",
+    "\n",
+    "if os.path.exists('../src'): sys.path.insert(0, '../src')\n",
+    "elif os.path.exists('src'): sys.path.insert(0, 'src')\n",
+    "\n",
+    "from domain_tokenizer import (\n",
+    "    DomainTokenizerBuilder, DomainTransformerConfig,\n",
+    "    DomainTransformerForCausalLM, JointFusionModel,\n",
+    "    DomainFinetuneDataset, prepare_finetune_dataset, finetune_domain_model,\n",
+    ")\n",
+    "from domain_tokenizer.schema import DomainSchema, FieldSpec, FieldType\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')\n",
+    "print(f'torch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')\n",
+    "if torch.cuda.is_available():\n",
+    "    print(f'GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import login\n",
+    "login()\n",
+    "\n",
+    "import wandb\n",
+    "wandb.login()\n",
+    "os.environ['WANDB_PROJECT'] = 'domainTokenizer'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1 — Load Pre-trained Artifacts\n",
+    "\n",
+    "Load the artifacts saved by `02_ecommerce_pretrain.ipynb`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Restore the per-user event sequences written by the pre-training notebook\n",
+    "with open('./ecommerce_artifacts.pkl', 'rb') as artifacts_file:\n",
+    "    artifacts = pickle.load(artifacts_file)\n",
+    "\n",
+    "user_sequences, user_ids = artifacts['user_sequences'], artifacts['user_ids']\n",
+    "print(f'Loaded {len(user_sequences):,} users')\n",
+    "\n",
+    "# Tokenizer saved next to the artifacts\n",
+    "from transformers import PreTrainedTokenizerFast\n",
+    "hf_tokenizer = PreTrainedTokenizerFast.from_pretrained('./ecommerce_tokenizer')\n",
+    "print(f'Tokenizer vocab: {hf_tokenizer.vocab_size}')\n",
+    "\n",
+    "# The schema and a fitted builder are rebuilt here because tokenize_event needs them\n",
+    "schema_fields = [\n",
+    "    FieldSpec(\n",
+    "        name='event_type', field_type=FieldType.CATEGORICAL_FIXED, prefix='EVT',\n",
+    "        categories=['view', 'cart', 'remove_from_cart', 'purchase'],\n",
+    "    ),\n",
+    "    FieldSpec(name='price', field_type=FieldType.NUMERICAL_CONTINUOUS, prefix='PRICE', n_bins=21),\n",
+    "    FieldSpec(name='category', field_type=FieldType.TEXT, prefix='CAT'),\n",
+    "    FieldSpec(name='timestamp', field_type=FieldType.TEMPORAL, calendar_fields=['dow', 'hour']),\n",
+    "]\n",
+    "ECOMMERCE_REES46_SCHEMA = DomainSchema(name='ecommerce_rees46', fields=schema_fields)\n",
+    "builder = DomainTokenizerBuilder(ECOMMERCE_REES46_SCHEMA)\n",
+    "\n",
+    "# Fit on every event of every user, then release the flattened copy\n",
+    "all_events_flat = []\n",
+    "for seq in user_sequences:\n",
+    "    all_events_flat.extend(seq)\n",
+    "builder.fit(all_events_flat)\n",
+    "del all_events_flat\n",
+    "gc.collect()\n",
+    "print('Builder fitted')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load pre-trained model.\n",
+    "# The checkpoint is a safetensors file; torch.load() is a pickle reader and cannot\n",
+    "# parse that format, so read it with the safetensors loader (safetensors is installed\n",
+    "# as a dependency of transformers).\n",
+    "from safetensors.torch import load_file\n",
+    "\n",
+    "CHECKPOINT_PATH = './ecommerce_pretrain_checkpoints/final/model.safetensors'\n",
+    "\n",
+    "config = DomainTransformerConfig.from_preset('24m', vocab_size=hf_tokenizer.vocab_size)\n",
+    "model = DomainTransformerForCausalLM(config)\n",
+    "state_dict = load_file(CHECKPOINT_PATH, device='cpu')\n",
+    "\n",
+    "# strict=False tolerates key mismatches, so surface them: a checkpoint that matches\n",
+    "# few or no parameters would otherwise leave the model randomly initialised without\n",
+    "# any visible error.\n",
+    "load_result = model.load_state_dict(state_dict, strict=False)\n",
+    "if load_result.missing_keys:\n",
+    "    print(f'WARNING: {len(load_result.missing_keys)} parameters missing from checkpoint, '\n",
+    "          f'e.g. {load_result.missing_keys[:5]}')\n",
+    "if load_result.unexpected_keys:\n",
+    "    print(f'WARNING: {len(load_result.unexpected_keys)} unexpected checkpoint keys, '\n",
+    "          f'e.g. {load_result.unexpected_keys[:5]}')\n",
+    "print(f'Pre-trained model loaded: {sum(p.numel() for p in model.parameters()):,} params')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2 — Create Labels and Tabular Features\n",
+    "\n",
+    "**Label:** Binary — did the user make at least one purchase? (1=yes, 0=no)\n",
+    "\n",
+    "**Tabular features:** Hand-crafted from user sequences (for the DCNv2 branch and LightGBM baseline)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_user_features(events):\n",
+    "    \"\"\"Extract tabular features from a user's event sequence.\"\"\"\n",
+    "    n_events = len(events)\n",
+    "    n_views = sum(1 for e in events if e['event_type'] == 'view')\n",
+    "    n_carts = sum(1 for e in events if e['event_type'] == 'cart')\n",
+    "    n_purchases = sum(1 for e in events if e['event_type'] == 'purchase')\n",
+    "    n_removes = sum(1 for e in events if e['event_type'] == 'remove_from_cart')\n",
+    "    \n",
+    "    prices = [e['price'] for e in events if e['price'] > 0]\n",
+    "    avg_price = np.mean(prices) if prices else 0\n",
+    "    max_price = max(prices) if prices else 0\n",
+    "    std_price = np.std(prices) if len(prices) > 1 else 0\n",
+    "    \n",
+    "    categories = set(e['category'] for e in events)\n",
+    "    n_unique_categories = len(categories)\n",
+    "    \n",
+    "    # Temporal features\n",
+    "    hours = [e['timestamp'].hour for e in events]\n",
+    "    avg_hour = np.mean(hours)\n",
+    "    \n",
+    "    # Conversion funnel ratios\n",
+    "    cart_rate = n_carts / max(n_views, 1)\n",
+    "    purchase_rate = n_purchases / max(n_events, 1)\n",
+    "    remove_rate = n_removes / max(n_carts, 1) if n_carts > 0 else 0\n",
+    "    \n",
+    "    return [\n",
+    "        n_events, n_views, n_carts, n_purchases, n_removes,\n",
+    "        avg_price, max_price, std_price,\n",
+    "        n_unique_categories,\n",
+    "        avg_hour,\n",
+    "        cart_rate, purchase_rate, remove_rate,\n",
+    "    ]\n",
+    "\n",
+    "FEATURE_NAMES = [\n",
+    "    'n_events', 'n_views', 'n_carts', 'n_purchases', 'n_removes',\n",
+    "    'avg_price', 'max_price', 'std_price',\n",
+    "    'n_unique_categories',\n",
+    "    'avg_hour',\n",
+    "    'cart_rate', 'purchase_rate', 'remove_rate',\n",
+    "]\n",
+    "\n",
+    "print(f'Computing features for {len(user_sequences):,} users...')\n",
+    "tabular_features = np.array([compute_user_features(seq) for seq in user_sequences], dtype=np.float32)\n",
+    "labels = np.array([1.0 if any(e['event_type'] == 'purchase' for e in seq) else 0.0 for seq in user_sequences])\n",
+    "\n",
+    "print(f'Features shape: {tabular_features.shape}')\n",
+    "print(f'Labels: {labels.sum():.0f} purchasers / {len(labels)} total ({labels.mean()*100:.1f}%)')\n",
+    "print(f'Feature names: {FEATURE_NAMES}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train/test split (80/20, stratified by label)\n",
+    "train_idx, test_idx = train_test_split(\n",
+    "    range(len(user_sequences)), test_size=0.2, random_state=42, stratify=labels\n",
+    ")\n",
+    "\n",
+    "train_seqs = [user_sequences[i] for i in train_idx]\n",
+    "test_seqs = [user_sequences[i] for i in test_idx]\n",
+    "train_features = tabular_features[train_idx]\n",
+    "test_features = tabular_features[test_idx]\n",
+    "train_labels = labels[train_idx]\n",
+    "test_labels = labels[test_idx]\n",
+    "\n",
+    "print(f'Train: {len(train_seqs):,} users ({train_labels.mean()*100:.1f}% positive)')\n",
+    "print(f'Test: {len(test_seqs):,} users ({test_labels.mean()*100:.1f}% positive)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3 — LightGBM Baseline\n",
+    "\n",
+    "Standard ML baseline: LightGBM on hand-crafted tabular features. This is what we need to beat."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lightgbm as lgb\n",
+    "\n",
+    "# Gradient-boosted trees on the tabular features only\n",
+    "lgb_params = dict(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42, verbose=-1)\n",
+    "lgb_model = lgb.LGBMClassifier(**lgb_params)\n",
+    "lgb_model.fit(train_features, train_labels)\n",
+    "\n",
+    "# Probability of the positive class for each split\n",
+    "lgb_train_probs, lgb_test_probs = (\n",
+    "    lgb_model.predict_proba(split_features)[:, 1]\n",
+    "    for split_features in (train_features, test_features)\n",
+    ")\n",
+    "lgb_train_auc = roc_auc_score(train_labels, lgb_train_probs)\n",
+    "lgb_test_auc = roc_auc_score(test_labels, lgb_test_probs)\n",
+    "\n",
+    "print('LightGBM Baseline:')\n",
+    "print(f'  Train AUC: {lgb_train_auc:.4f}')\n",
+    "print(f'  Test AUC:  {lgb_test_auc:.4f}')\n",
+    "\n",
+    "# Rank features by importance and show the five strongest\n",
+    "importance = pd.Series(lgb_model.feature_importances_, index=FEATURE_NAMES)\n",
+    "importance = importance.sort_values(ascending=False)\n",
+    "print('\\nTop features:')\n",
+    "for feature_name, score in importance.iloc[:5].items():\n",
+    "    print(f'  {feature_name}: {score}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4 — JointFusionModel Fine-Tuning\n",
+    "\n",
+    "The JointFusionModel combines:\n",
+    "- **Transaction branch:** Pre-trained DomainTransformer → user embedding\n",
+    "- **Tabular branch:** DCNv2 with PLR embeddings on hand-crafted features\n",
+    "- **Joint head:** MLP on concatenated embeddings → binary prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrap each split in a DomainFinetuneDataset\n",
+    "MAX_LENGTH = 256  # tokens per user sequence\n",
+    "\n",
+    "\n",
+    "def build_finetune_dataset(seqs, features, targets):\n",
+    "    \"\"\"Build a DomainFinetuneDataset for one split with the shared builder/tokenizer.\"\"\"\n",
+    "    return DomainFinetuneDataset(\n",
+    "        seqs, features, targets,\n",
+    "        builder, hf_tokenizer, max_length=MAX_LENGTH,\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "train_dataset = build_finetune_dataset(train_seqs, train_features, train_labels)\n",
+    "test_dataset = build_finetune_dataset(test_seqs, test_features, test_labels)\n",
+    "\n",
+    "print(f'Train dataset: {len(train_dataset)} samples')\n",
+    "print(f'Test dataset: {len(test_dataset)} samples')\n",
+    "first_sample = train_dataset[0]\n",
+    "print(f'Sample: {set(first_sample.keys())}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assemble the JointFusionModel around the pre-trained transformer\n",
+    "fusion_hparams = dict(\n",
+    "    n_classes=1,  # binary\n",
+    "    plr_frequencies=32,\n",
+    "    plr_embedding_dim=32,\n",
+    "    dcn_cross_layers=3,\n",
+    "    dcn_deep_layers=2,\n",
+    "    dcn_deep_dim=128,\n",
+    "    head_hidden_dim=128,\n",
+    "    dropout=0.1,\n",
+    ")\n",
+    "fusion_model = JointFusionModel(\n",
+    "    transformer_model=model,\n",
+    "    n_tabular_features=len(FEATURE_NAMES),\n",
+    "    **fusion_hparams,\n",
+    ")\n",
+    "\n",
+    "n_params = 0\n",
+    "for param in fusion_model.parameters():\n",
+    "    n_params += param.numel()\n",
+    "print(f'JointFusion model: {n_params:,} params (transformer + DCNv2 + head)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "USE_GPU = torch.cuda.is_available()\n",
+    "GPU_NAME = torch.cuda.get_device_name(0) if USE_GPU else ''\n",
+    "# bf16 needs Ampere or newer (compute capability >= 8). Checking only for 'T4' in the\n",
+    "# device name would also enable bf16 on other pre-Ampere GPUs (V100, P100, RTX 20xx),\n",
+    "# so decide from the capability and fall back to fp16 otherwise.\n",
+    "USE_BF16 = USE_GPU and torch.cuda.get_device_capability(0)[0] >= 8\n",
+    "USE_FP16 = USE_GPU and not USE_BF16\n",
+    "# Shared by num_epochs and run_name so the logged run name matches what actually ran.\n",
+    "NUM_EPOCHS = 5 if USE_GPU else 2\n",
+    "\n",
+    "trainer = finetune_domain_model(\n",
+    "    model=fusion_model,\n",
+    "    train_dataset=train_dataset,\n",
+    "    # NOTE(review): the test split doubles as the eval set. That is harmless while nothing\n",
+    "    # is selected on it (save_strategy='no'); carve out a validation split before adding\n",
+    "    # early stopping or best-checkpoint selection.\n",
+    "    eval_dataset=test_dataset,\n",
+    "    output_dir='./ecommerce_finetune_checkpoints',\n",
+    "    num_epochs=NUM_EPOCHS,\n",
+    "    per_device_batch_size=32 if USE_GPU else 8,\n",
+    "    gradient_accumulation_steps=1,\n",
+    "    learning_rate=1e-4,\n",
+    "    warmup_steps=50,\n",
+    "    logging_steps=20,\n",
+    "    eval_steps=100 if USE_GPU else 50,\n",
+    "    save_strategy='no',\n",
+    "    bf16=USE_BF16,\n",
+    "    fp16=USE_FP16,\n",
+    "    report_to='wandb',\n",
+    "    run_name=f'ecommerce-finetune-joint-{NUM_EPOCHS}ep',\n",
+    "    seed=42,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5 — Evaluate and Compare"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Score the held-out split with the fine-tuned JointFusion model\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "fusion_model = fusion_model.to(device)\n",
+    "fusion_model.eval()\n",
+    "\n",
+    "loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)\n",
+    "prob_chunks, label_chunks = [], []\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for batch in loader:\n",
+    "        # Labels are only needed for scoring, so they never go through the model\n",
+    "        labels_batch = batch.pop('labels')\n",
+    "        inputs = {name: tensor.to(device) for name, tensor in batch.items()}\n",
+    "        logits = fusion_model(**inputs)['logits']\n",
+    "        prob_chunks.append(torch.sigmoid(logits.squeeze(-1)).cpu().numpy())\n",
+    "        label_chunks.append(labels_batch.cpu().numpy())\n",
+    "\n",
+    "all_probs = np.concatenate(prob_chunks)\n",
+    "all_labels = np.concatenate(label_chunks)\n",
+    "\n",
+    "fusion_test_auc = roc_auc_score(all_labels, all_probs)\n",
+    "print(f'JointFusion Test AUC: {fusion_test_auc:.4f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Side-by-side test AUC of the two models\n",
+    "rule = '=' * 50\n",
+    "auc_gap = fusion_test_auc - lgb_test_auc\n",
+    "\n",
+    "print(rule)\n",
+    "print('MODEL COMPARISON — Purchase Prediction (AUC)')\n",
+    "print(rule)\n",
+    "print(f'  LightGBM (tabular only):        {lgb_test_auc:.4f}')\n",
+    "print(f'  JointFusion (Transformer+DCNv2): {fusion_test_auc:.4f}')\n",
+    "print(f'  Difference:                      {auc_gap:+.4f}')\n",
+    "print(rule)\n",
+    "\n",
+    "# abs() keeps the reported margin positive whichever model is ahead\n",
+    "margin_pp = abs(auc_gap) * 100\n",
+    "if auc_gap > 0:\n",
+    "    print(f'\\n✅ JointFusion beats LightGBM by {margin_pp:.2f} percentage points')\n",
+    "else:\n",
+    "    print(f'\\n⚠️ LightGBM still leads by {margin_pp:.2f} percentage points')\n",
+    "    print('    (Expected with only 3-epoch pre-training. More epochs would improve the transformer embeddings.)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Loss curve\n",
+    "# Each Trainer log entry records the global step it was written at. Plot against that\n",
+    "# step so train and eval points share one x-axis, instead of stretching the eval points\n",
+    "# evenly across the number of train-log entries.\n",
+    "log_history = trainer.state.log_history\n",
+    "train_log = [h for h in log_history if 'loss' in h]\n",
+    "eval_log = [h for h in log_history if 'eval_loss' in h]\n",
+    "losses = [h['loss'] for h in train_log]\n",
+    "eval_losses = [h['eval_loss'] for h in eval_log]\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(10, 5))\n",
+    "ax.plot([h['step'] for h in train_log], losses, label='Train Loss', alpha=0.7)\n",
+    "if eval_losses:\n",
+    "    ax.plot([h['step'] for h in eval_log], eval_losses, 'ro-', label='Eval Loss', markersize=4)\n",
+    "ax.set_xlabel('Step'); ax.set_ylabel('Loss'); ax.set_title('Fine-Tuning Loss')\n",
+    "ax.legend(); ax.grid(True, alpha=0.3); plt.tight_layout(); plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wandb.finish()\n",
+    "print('Done!')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "| Model | Test AUC | Notes |\n",
+    "|-------|----------|-------|\n",
+    "| LightGBM (tabular) | *see above* | Hand-crafted tabular features only |\n",
+    "| JointFusion (Transformer+DCNv2) | *see above* | Pre-trained domain tokens + the same tabular features |\n",
+    "\n",
+    "The pre-trained DomainTransformer is intended to capture sequential behavioral patterns (view→cart→purchase funnels, category stickiness, temporal habits) that hand-crafted features cannot fully represent; the AUC comparison above shows whether that advantage materializes on this dataset."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
+  "language_info": { "name": "python", "version": "3.12.0" }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}