{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 02 — E-Commerce Pre-Training: Domain Tokenizer on Real User Behavior Data\n", "\n", "**Goal:** Pre-train a 24M DomainTransformer on real e-commerce behavioral sequences (view → cart → purchase funnels) where sequential patterns actually exist.\n", "\n", "**Dataset:** [REES46 Multi-Category Store](https://huggingface.co/datasets/kevykibbz/ecommerce-behavior-data-from-multi-category-store_oct-nov_2019) — ~110M events, real user behavior.\n", "\n", "**Critical fix applied:** Uses Whitespace pre-tokenizer (not ByteLevel) to avoid the 42% UNK bug from the first run.\n", "\n", "**Hardware:** L4/T4 GPU. For machines with <64GB RAM, we subsample to 10M events." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install datasets transformers torch accelerate tokenizers numpy pandas matplotlib scikit-learn wandb huggingface_hub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import logging, time, pickle, os, sys, gc\n", "from datetime import datetime\n", "from collections import Counter\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import torch\n", "from datasets import load_dataset\n", "\n", "if os.path.exists('../src'): sys.path.insert(0, '../src')\n", "elif os.path.exists('src'): sys.path.insert(0, 'src')\n", "\n", "from domain_tokenizer import (\n", " DomainTokenizerBuilder, DomainTransformerConfig,\n", " DomainTransformerForCausalLM, prepare_clm_dataset, pretrain_domain_model,\n", ")\n", "from domain_tokenizer.schema import DomainSchema, FieldSpec, FieldType\n", "\n", "logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')\n", "print(f'torch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')\n", "if torch.cuda.is_available():\n", " print(f'GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# HuggingFace login — needed for push_to_hub\n", "from huggingface_hub import login\n", "login() # prompts for token or reads from ~/.cache/huggingface/token" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# wandb login — logs persist even if notebook disconnects\n", "import wandb\n", "wandb.login()\n", "os.environ['WANDB_PROJECT'] = 'domainTokenizer'\n", "print('wandb ready')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1 — Load Dataset\n", "\n", "**IMPORTANT:** Full dataset is 110M rows (~25GB in RAM). If your machine has <64GB RAM, subsample at load time." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "ds = load_dataset(\n", " 'kevykibbz/ecommerce-behavior-data-from-multi-category-store_oct-nov_2019',\n", " split='train',\n", ")\n", "print(f'Full dataset: {len(ds):,} events')\n", "\n", "# Subsample to 10M events if RAM < 64GB (most machines)\n", "MAX_EVENTS_LOAD = 10_000_000\n", "if len(ds) > MAX_EVENTS_LOAD:\n", " ds = ds.shuffle(seed=42).select(range(MAX_EVENTS_LOAD))\n", " print(f'Subsampled to {len(ds):,} events (RAM-safe)')\n", "\n", "df = ds.to_pandas()\n", "del ds; gc.collect() # free the Arrow dataset\n", "print(f'DataFrame: {df.shape}, ~{df.memory_usage(deep=True).sum()/1e9:.1f}GB RAM')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2 — Data Profiling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Unique users: {df['user_id'].nunique():,}\")\n", "print(f\"Unique products: {df['product_id'].nunique():,}\")\n", "print(f\"Unique categories: {df['category_code'].nunique():,}\")\n", "print(f\"Unique brands: {df['brand'].nunique():,}\")\n", "print(f\"Price range: {df['price'].min():.2f} to {df['price'].max():.2f}\")\n", "print(f\"\\nEvent types:\\n{df['event_type'].value_counts().to_string()}\")\n", "print(f\"\\nCategory codes (top 15):\\n{df['category_code'].value_counts().head(15).to_string()}\")\n", "print(f\"\\nNull rates:\")\n", "for col in df.columns:\n", " null_pct = df[col].isnull().mean() * 100\n", " if null_pct > 0: print(f\" {col}: {null_pct:.1f}%\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "events_per_user = df.groupby('user_id').size()\n", "print(f\"Events/user: min={events_per_user.min()}, max={events_per_user.max()}, \"\n", " f\"mean={events_per_user.mean():.1f}, median={events_per_user.median():.0f}\")\n", "print(f\"Users with 10+ events: {(events_per_user >= 10).sum():,}\")\n", "print(f\"Users with 20+ events: {(events_per_user >= 20).sum():,}\")\n", "\n", "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n", "axes[0].hist(np.log10(df['price'].clip(lower=0.01)), bins=50, edgecolor='black', alpha=0.7)\n", "axes[0].set_xlabel('log10(Price)'); axes[0].set_title('Price Distribution')\n", "axes[1].hist(events_per_user.clip(upper=100), bins=50, edgecolor='black', alpha=0.7)\n", "axes[1].set_xlabel('Events/User'); axes[1].set_title('Events per User')\n", "df['event_type'].value_counts().plot(kind='barh', ax=axes[2])\n", "axes[2].set_xlabel('Count'); axes[2].set_title('Event Types')\n", "plt.tight_layout(); plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3 — Sequential Entropy Check\n", "\n", "Verify learnable sequential patterns exist before committing GPU time." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sample_users = df['user_id'].drop_duplicates().sample(min(50000, df['user_id'].nunique()), random_state=42)\n", "sample_df = df[df['user_id'].isin(sample_users)].sort_values(['user_id', 'event_time'])\n", "\n", "unigram = sample_df['event_type'].value_counts(normalize=True)\n", "H_unigram = -(unigram * np.log2(unigram)).sum()\n", "\n", "bigrams, prev_context = Counter(), Counter()\n", "for uid, group in sample_df.groupby('user_id'):\n", " events = group['event_type'].tolist()\n", " for i in range(1, len(events)):\n", " bigrams[(events[i-1], events[i])] += 1\n", " prev_context[events[i-1]] += 1\n", "\n", "H_conditional = 0\n", "total_bigrams = sum(bigrams.values())\n", "for (prev, curr), count in bigrams.items():\n", " p_joint = count / total_bigrams\n", " p_cond = count / prev_context[prev]\n", " H_conditional -= p_joint * np.log2(p_cond)\n", "\n", "mutual_info = H_unigram - H_conditional\n", "print(f'H(event_type): {H_unigram:.3f} bits')\n", "print(f'H(event_type | prev): {H_conditional:.3f} bits')\n", "print(f'Mutual info: {mutual_info:.3f} bits ({mutual_info/H_unigram*100:.1f}% predictability gain)')\n", "print(f'\\nNote: This only measures event_type transitions. The model also learns')\n", "print(f'category, price, and temporal patterns — much richer sequential structure.')\n", "\n", "print(f'\\nTransition probabilities:')\n", "for prev in sorted(unigram.index):\n", " trans = {curr: bigrams.get((prev, curr), 0) / prev_context[prev] \n", " for curr in sorted(unigram.index) if bigrams.get((prev, curr), 0) > 0}\n", " row = ' | '.join(f'{c}: {p:.2f}' for c, p in sorted(trans.items(), key=lambda x: -x[1]))\n", " print(f' After {prev:20s} → {row}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 4 — Schema & Event Conversion" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ECOMMERCE_REES46_SCHEMA = DomainSchema(\n", " name='ecommerce_rees46',\n", " description='REES46 e-commerce behavioral event schema',\n", " fields=[\n", " FieldSpec(name='event_type', field_type=FieldType.CATEGORICAL_FIXED, prefix='EVT',\n", " categories=['view', 'cart', 'remove_from_cart', 'purchase']),\n", " FieldSpec(name='price', field_type=FieldType.NUMERICAL_CONTINUOUS, prefix='PRICE', n_bins=21),\n", " FieldSpec(name='category', field_type=FieldType.TEXT, prefix='CAT'),\n", " FieldSpec(name='timestamp', field_type=FieldType.TEMPORAL, calendar_fields=['dow', 'hour']),\n", " ],\n", ")\n", "print(ECOMMERCE_REES46_SCHEMA.summary())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def row_to_event(row):\n", " dt = datetime.strptime(row['event_time'][:19], '%Y-%m-%dT%H:%M:%S')\n", " cat = row['category_code'] if pd.notna(row['category_code']) else (row['brand'] if pd.notna(row['brand']) else 'unknown')\n", " return {'event_type': row['event_type'], 'price': row['price'], 'category': cat, 'timestamp': dt}\n", "\n", "print(f'Sample: {row_to_event(df.iloc[0])}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "MIN_EVENTS = 10\n", "MAX_EVENTS = 200\n", "MAX_USERS = 100_000\n", "\n", "user_counts = df.groupby('user_id').size()\n", "eligible_users = user_counts[user_counts >= MIN_EVENTS].index\n", "print(f'Users with {MIN_EVENTS}+ events: {len(eligible_users):,}')\n", "\n", "if len(eligible_users) > MAX_USERS:\n", " eligible_users = 
pd.Series(eligible_users).sample(MAX_USERS, random_state=42).values\n", " print(f'Subsampled to {MAX_USERS:,} users')\n", "\n", "filtered_df = df[df['user_id'].isin(eligible_users)].sort_values(['user_id', 'event_time'])\n", "\n", "user_sequences, user_ids = [], []\n", "for uid, group in filtered_df.groupby('user_id'):\n", " user_sequences.append([row_to_event(row) for _, row in group.head(MAX_EVENTS).iterrows()])\n", " user_ids.append(uid)\n", "\n", "print(f'Users: {len(user_sequences):,}, Events: {sum(len(s) for s in user_sequences):,}')\n", "print(f'Events/user: min={min(len(s) for s in user_sequences)}, max={max(len(s) for s in user_sequences)}, mean={np.mean([len(s) for s in user_sequences]):.1f}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Free the big DataFrame — we only need user_sequences from here\n", "del df, filtered_df, user_counts, eligible_users\n", "gc.collect()\n", "print('DataFrame freed from memory')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 5 — Build Domain Tokenizer\n", "\n", "**Uses Whitespace pre-tokenizer** (fix for the 42% UNK bug from ByteLevel)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_events = [e for seq in user_sequences for e in seq]\n", "print(f'Total events for fitting: {len(all_events):,}')\n", "\n", "builder = DomainTokenizerBuilder(ECOMMERCE_REES46_SCHEMA)\n", "builder.fit(all_events)\n", "\n", "text_corpus = [e['category'] for e in all_events]\n", "unique_cats = sorted(set(text_corpus))\n", "print(f'Unique category strings: {len(unique_cats):,}')\n", "for c in unique_cats[:10]: print(f\" '{c}'\")\n", "if len(unique_cats) > 10: print(f' ... and {len(unique_cats)-10} more')\n", "\n", "hf_tokenizer = builder.build(text_corpus=text_corpus, bpe_vocab_size=4000)\n", "print(f'\\nVocab size: {hf_tokenizer.vocab_size}')\n", "\n", "del all_events, text_corpus; gc.collect() # free fitting data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Verify UNK rate is now ~0% (was 42% with ByteLevel bug)\n", "print('--- Sample event ---')\n", "for i, t in enumerate(builder.tokenize_event(user_sequences[0][0])): print(f' [{i}] {t}')\n", "\n", "seq_tokens = builder.tokenize_sequence(user_sequences[0][:3])\n", "print(f'\\n--- First 3 events ({len(seq_tokens)} token strings) ---')\n", "for i, t in enumerate(seq_tokens): print(f' [{i:3d}] {t}')\n", "\n", "seq_ids = hf_tokenizer(' '.join(seq_tokens), add_special_tokens=False)['input_ids']\n", "unk_count = sum(1 for i in seq_ids if i == hf_tokenizer.unk_token_id)\n", "print(f'\\n✅ UNK rate: {unk_count}/{len(seq_ids)} ({unk_count/max(len(seq_ids),1)*100:.1f}%) — should be ~0%')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 6 — Pack and Train" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "BLOCK_SIZE = 512\n", "dataset = prepare_clm_dataset(user_sequences, builder, hf_tokenizer, block_size=BLOCK_SIZE)\n", "print(f'Packed: {len(dataset):,} blocks x {BLOCK_SIZE} = {len(dataset)*BLOCK_SIZE:,} training tokens')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f'Sample block decoded:')\n", "print(hf_tokenizer.decode(dataset[0]['input_ids'][:40]))\n", "\n", "# Quick UNK check on full dataset\n", "sample_ids = dataset[0]['input_ids'] + dataset[len(dataset)//2]['input_ids'] + dataset[-1]['input_ids']\n", 
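"# Count UNK ids in the concatenated first/middle/last blocks; with the\n", "# Whitespace pre-tokenizer this should be exactly 0\n",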
"unk_in_sample = sum(1 for i in sample_ids if i == hf_tokenizer.unk_token_id)\n", "print(f'\\nUNK in 3 sample blocks: {unk_in_sample}/{len(sample_ids)} ({unk_in_sample/len(sample_ids)*100:.2f}%)')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "config = DomainTransformerConfig.from_preset('24m', vocab_size=hf_tokenizer.vocab_size)\n", "model = DomainTransformerForCausalLM(config)\n", "print(f'Model: {sum(p.numel() for p in model.parameters()):,} params | d={config.hidden_size}, L={config.num_hidden_layers}, H={config.num_attention_heads}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "USE_GPU = torch.cuda.is_available()\n", "# T4 doesn't support bf16 — use fp16 instead\n", "GPU_NAME = torch.cuda.get_device_name(0) if USE_GPU else ''\n", "USE_BF16 = USE_GPU and 'T4' not in GPU_NAME # L4, A100 support bf16\n", "USE_FP16 = USE_GPU and not USE_BF16 # T4 uses fp16\n", "\n", "print(f'Precision: {\"bf16\" if USE_BF16 else \"fp16\" if USE_FP16 else \"fp32\"}')\n", "\n", "trainer = pretrain_domain_model(\n", " model=model,\n", " tokenizer=hf_tokenizer,\n", " train_dataset=dataset,\n", " output_dir='./ecommerce_pretrain_checkpoints',\n", " hub_model_id='rtferraz/ecommerce-domain-24m',\n", " num_epochs=3 if USE_GPU else 1,\n", " per_device_batch_size=32 if USE_GPU else 4,\n", " gradient_accumulation_steps=4 if USE_GPU else 1,\n", " learning_rate=3e-4,\n", " warmup_steps=200 if USE_GPU else 10,\n", " logging_steps=50 if USE_GPU else 10,\n", " save_steps=2000 if USE_GPU else 999999,\n", " bf16=USE_BF16,\n", " fp16=USE_FP16,\n", " report_to='wandb',\n", " run_name='ecommerce-pretrain-24m-3ep',\n", " seed=42,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 7 — Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "losses = [h['loss'] for h in trainer.state.log_history if 'loss' in h]\n", "print(f'Steps: {trainer.state.global_step:,}')\n", "print(f'Loss: {losses[0]:.4f} -> {losses[-1]:.4f} ({(1-losses[-1]/losses[0])*100:.1f}% reduction)')\n", "print(f'Min loss: {min(losses):.4f}')\n", "\n", "random_loss = np.log(hf_tokenizer.vocab_size)\n", "print(f'Random chance loss: {random_loss:.4f}')\n", "print(f'Model vs random: {\"✅ Better\" if losses[-1] < random_loss else \"❌ Worse\"} ({losses[-1]:.2f} vs {random_loss:.2f})')\n", "\n", "fig, ax = plt.subplots(figsize=(10, 5))\n", "ax.plot(losses, linewidth=0.5, alpha=0.5, label='Per-step')\n", "window = max(len(losses) // 50, 1)\n", "if len(losses) > window:\n", " ax.plot(pd.Series(losses).rolling(window=window, min_periods=1).mean(), linewidth=2, color='red', label='Smoothed')\n", "ax.axhline(y=random_loss, color='gray', linestyle='--', label=f'Random ({random_loss:.2f})')\n", "ax.set_xlabel('Step'); ax.set_ylabel('Loss'); ax.set_title('E-Commerce Pre-Training Loss')\n", "ax.legend(); ax.grid(True, alpha=0.3); plt.tight_layout(); plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.eval()\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "model = model.to(device)\n", "\n", "test_ids = hf_tokenizer(' '.join(builder.tokenize_sequence(user_sequences[0][:5])),\n", " return_tensors='pt', add_special_tokens=False)['input_ids'].to(device)\n", "with torch.no_grad():\n", " top5 = torch.topk(model(input_ids=test_ids).logits[0, -1, :], 5)\n", "\n", "print('Last 5 input tokens:')\n", "for tid in test_ids[0, 
-5:]: print(f\" {tid.item():5d} -> '{hf_tokenizer.decode([tid.item()])}'\")\n", "print('\\nTop-5 next token predictions:')\n", "for score, tid in zip(top5.values, top5.indices): print(f\" {tid.item():5d} -> '{hf_tokenizer.decode([tid.item()])}' (score={score.item():.3f})\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# User embeddings — behavioral clusters\n", "n_sample = min(500, len(user_sequences))\n", "embeddings, purchase_rates = [], []\n", "\n", "for i in range(n_sample):\n", " enc = hf_tokenizer(' '.join(builder.tokenize_sequence(user_sequences[i][:50])),\n", " return_tensors='pt', add_special_tokens=False, max_length=256, truncation=True, padding='max_length')\n", " with torch.no_grad():\n", " embeddings.append(model.get_user_embedding(enc['input_ids'].to(device), enc['attention_mask'].to(device)).cpu().numpy().flatten())\n", " events = user_sequences[i]\n", " purchase_rates.append(sum(1 for e in events if e['event_type'] == 'purchase') / len(events))\n", "\n", "embeddings = np.array(embeddings); purchase_rates = np.array(purchase_rates)\n", "print(f'Embeddings: {embeddings.shape}')\n", "\n", "from sklearn.manifold import TSNE\n", "coords = TSNE(n_components=2, random_state=42, perplexity=30).fit_transform(embeddings)\n", "fig, ax = plt.subplots(figsize=(8, 6))\n", "sc = ax.scatter(coords[:, 0], coords[:, 1], c=purchase_rates, cmap='RdYlGn', alpha=0.6, s=20, edgecolors='black', linewidth=0.2)\n", "ax.set_title('User Embeddings (t-SNE) — Colored by Purchase Rate')\n", "plt.colorbar(sc, label='Purchase Rate'); plt.tight_layout(); plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hf_tokenizer.save_pretrained('./ecommerce_tokenizer')\n", "builder.save('./ecommerce_tokenizer')\n", "model.save_pretrained('./ecommerce_pretrain_checkpoints/final')\n", "with open('./ecommerce_artifacts.pkl', 'wb') as f:\n", " pickle.dump({'user_sequences': user_sequences, 'user_ids': user_ids}, f)\n", "print('Saved all artifacts')\n", "wandb.finish()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 4 }