"""
RTB Bidding Algorithm Comparison on Real Criteo Data
"""
import numpy as np
import pandas as pd
import json
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
print("=" * 60)
print("RTB BIDDING ON REAL CRITEO DATA")
print("=" * 60)

# Load Criteo_x4 dataset: stream the train split and keep only the first
# 100,000 rows so the experiment stays small.
ds = load_dataset("reczoo/Criteo_x4", split="train", streaming=True)
# zip() against a bounded range stops after 100,000 rows (or earlier if the
# stream is shorter) without pulling an extra row from the stream.
rows = [row for _, row in zip(range(100_000), ds)]
df = pd.DataFrame(rows)
print(f"Loaded {len(df)} rows, CTR: {df['Label'].mean():.4f}")
# Feature prep
sparse_cols = [f"C{i}" for i in range(1, 27)]
dense_cols = [f"I{i}" for i in range(1, 14)]

# Dense columns: fill missing values with the column median, then standardize
# to zero mean / unit variance. The 1e-8 keeps the division finite when a
# column is constant (std == 0).
# NOTE(review): the statistics are computed over all loaded rows, i.e. before
# the train/test split performed later in the script.
for col in dense_cols:
    filled = df[col].fillna(df[col].median())
    df[col] = (filled - filled.mean()) / (filled.std() + 1e-8)

# Sparse columns: replace each distinct value (missing -> "MISSING") with an
# integer code 1..K assigned in order of first appearance.
for col in sparse_cols:
    filled = df[col].fillna("MISSING")
    vocab = {value: code for code, value in enumerate(filled.unique(), start=1)}
    df[col] = filled.map(vocab)

# Feature matrix (dense columns first, then sparse codes) and label vector.
feature_cols = dense_cols + sparse_cols
X = df[feature_cols].values
y = df['Label'].values
# Simulate market prices
# Prices are drawn from a log-normal distribution whose log-mean is 1.0 plus a
# small linear contribution from the first two feature columns; the global
# NumPy seed makes the draw repeatable.
np.random.seed(42)
mu_price = 1.0 + 0.02 * X[:, 0] + 0.01 * X[:, 1]
market_price = np.random.lognormal(mean=mu_price, sigma=0.6)
print(f"Market price mean: {market_price.mean():.2f}")

# Train/test split
# Split row positions (80% / 20%) once, then slice every array with the same
# positions so features, labels and prices stay aligned.
train_idx, test_idx = train_test_split(
    range(len(df)),
    test_size=0.2,
    random_state=42,
)
X_train = X[train_idx]
X_test = X[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]
price_train = market_price[train_idx]
price_test = market_price[test_idx]
# CTR model
# Logistic regression fitted on the training rows; the predicted click
# probability for each test row is taken from column 1 of predict_proba.
print("\nTraining CTR model...")
lr = LogisticRegression(C=0.1, max_iter=200)
lr.fit(X_train, y_train)
test_proba = lr.predict_proba(X_test)
pctr = test_proba[:, 1]
print(f"pCTR range: [{pctr.min():.4f}, {pctr.max():.4f}], mean: {pctr.mean():.4f}")

# Price model
# Gradient-boosted regressor that predicts the simulated market price from the
# same features; only its mean absolute error on the test rows is reported.
print("Training price model...")
price_model = GradientBoostingRegressor(max_depth=4, n_estimators=50)
price_model.fit(X_train, price_train)
price_pred = price_model.predict(X_test)
print(f"Price prediction MAE: {np.mean(np.abs(price_pred - price_test)):.2f}")
# Bidding algorithms
# Value assigned to a single click. It is passed to DualOGD as `vpc`, where the
# value of an impression is computed as pctr * vpc.
VALUE_PER_CLICK = 50.0
class LinearBid:
    """Bid in proportion to predicted CTR relative to the average CTR.

    The bid is ``base * pctr / avg_pctr``: an impression with average
    predicted CTR receives exactly ``base``.

    Args:
        base: Bid placed for an impression whose pCTR equals ``avg_pctr``.
        avg_pctr: Reference (average) predicted CTR; must be positive.

    Raises:
        ValueError: If ``avg_pctr`` is not positive (the bid formula divides
            by it).
    """

    def __init__(self, base, avg_pctr):
        if avg_pctr <= 0:
            raise ValueError("avg_pctr must be positive")
        self.base = base
        self.avg = avg_pctr

    def bid(self, pctr, **kw):
        """Return the bid for one impression; extra keyword args are ignored."""
        return self.base * (pctr / self.avg)
class ORTB:
    """Concave, non-linear bidding function of predicted CTR.

    The bid is ``sqrt(c / lam * pctr + c**2) - c``, which is 0 at
    ``pctr == 0`` and grows with diminishing slope as pCTR rises. This has the
    form of the ORTB bid function from the optimal real-time bidding
    literature.

    Args:
        lam: Scaling parameter in the denominator; must be positive.
        c: Shape constant of the bid curve.

    Raises:
        ValueError: If ``lam`` is not positive (the formula divides by it).
    """

    def __init__(self, lam, c):
        if lam <= 0:
            raise ValueError("lam must be positive")
        self.lam = lam
        self.c = c

    def bid(self, pctr, **kw):
        """Return the bid for one impression; extra keyword args are ignored."""
        return np.sqrt((self.c / self.lam) * pctr + self.c ** 2) - self.c
class DualOGD:
    """Budget-constrained bidder with a dual variable updated by online
    gradient descent.

    For each auction the bidder searches a grid of candidate bids and picks
    the one maximizing ``(v - b) * w(b) - lam * b * w(b)``, where
    ``v = pctr * vpc`` is the impression value, ``w`` the supplied
    win-probability function and ``lam`` the current dual variable. After each
    auction ``update`` moves ``lam`` according to how the realized cost
    compares with the per-auction target spend ``rho = budget / T``.

    Args:
        budget: Total budget ``B``.
        T: Planned number of auctions (used for ``rho`` and the default step).
        vpc: Value per click.
        eps: Step size for the dual update; defaults to ``1 / sqrt(T)`` when
            ``None``.
    """

    # Lowest candidate bid on the search grid (when the cap allows it).
    MIN_BID = 0.5
    # Number of candidate bids evaluated per auction.
    GRID_SIZE = 40

    def __init__(self, budget, T, vpc=50, eps=None):
        self.B = budget
        self.T = T
        self.rho = budget / T
        self.vpc = vpc
        # `is None` (not `or`) so an explicit eps of 0 is honored.
        self.eps = 1.0 / np.sqrt(T) if eps is None else eps
        self.lam = 0.0
        self.spent = 0.0
        self.t = 0

    def bid(self, pctr, win_prob, **kw):
        """Return the bid for one impression.

        Args:
            pctr: Predicted click-through rate of the impression.
            win_prob: Callable mapping a bid to its probability of winning.

        Returns:
            The chosen bid as a float; 0.0 when the budget is exhausted or
            the bid cap is not positive. The bid never exceeds
            ``min(2 * v, remaining budget)``.
        """
        self.t += 1
        rem = self.B - self.spent
        if rem <= 0:
            return 0.0
        v = pctr * self.vpc
        max_b = min(v * 2, rem)
        if max_b <= 0:
            return 0.0
        # Clamp the grid start: with a fixed start of 0.5 and max_b < 0.5 the
        # grid would run downward from 0.5 and offer bids above the cap.
        lo = min(self.MIN_BID, max_b)
        bids = np.linspace(lo, max_b, self.GRID_SIZE)
        rewards = []
        for b in bids:
            w = win_prob(b)  # evaluate once per candidate
            rewards.append((v - b) * w - self.lam * b * w)
        return float(bids[np.argmax(rewards)])

    def update(self, cost):
        """Record the realized cost of the last auction and step the dual.

        ``lam`` increases when ``cost`` exceeds the target rate ``rho`` and
        decreases otherwise, but is never allowed below 0.
        """
        self.spent += cost
        self.lam = max(0.0, self.lam - self.eps * (self.rho - cost))
class Threshold:
    """Fixed-price bidder: bid a constant when pCTR clears a threshold.

    Args:
        th: pCTR threshold; only impressions with pCTR strictly above it
            receive a bid.
        val: Bid placed on qualifying impressions.
    """

    def __init__(self, th, val):
        self.th = th
        self.val = val

    def bid(self, pctr, **kw):
        """Return ``val`` if ``pctr > th`` else 0.0; extra kwargs are ignored."""
        if pctr > self.th:
            return self.val
        return 0.0
def _win_probability(b, price):
    """Logistic win-probability model centred on the market price."""
    if b <= 0:
        return 0.0
    return 1.0 / (1.0 + np.exp(-(b - price) / (price * 0.5)))


def simulate(algo, pctr, prices, clicks, budget, T):
    """Replay a sequence of auctions for one bidding algorithm.

    Each auction is won when the bid is at least the market price and paying
    the bid keeps total spend within ``budget``; the winner pays its own bid.
    The replay stops after ``T`` auctions, when the inputs run out, or once
    the budget is fully spent.

    Algorithms that expose an ``update`` method (``DualOGD`` in this file)
    additionally receive a ``win_prob`` callable when bidding and are told the
    realized cost of every auction afterwards.

    Args:
        algo: Object with ``bid(pctr, ...)`` and optionally ``update(cost)``.
        pctr: Predicted CTR per auction.
        prices: Market price per auction (same length as ``pctr``).
        clicks: Click label per auction (same length as ``pctr``).
        budget: Total spend allowed.
        T: Maximum number of auctions to replay.

    Returns:
        Dict with ``clicks``, ``impressions``, ``spent``, ``budget``, ``ctr``
        (clicks per impression), ``budget_used`` (fraction of budget spent,
        0.0 when ``budget`` is 0) and ``cpc`` (spend per click).
    """
    spent = 0.0
    clicks_got = 0
    imp = 0
    adaptive = hasattr(algo, "update")

    for _, p, price, click in zip(range(T), pctr, prices, clicks):
        if spent >= budget:
            break

        if adaptive:
            b = algo.bid(p, win_prob=lambda x, price=price: _win_probability(x, price))
        else:
            b = algo.bid(p)

        # Decide the outcome once so the feedback below always matches what
        # was actually charged (a bid blocked by the budget check costs 0).
        won = b >= price and spent + b <= budget
        if won:
            spent += b
            imp += 1
            clicks_got += int(click)

        if adaptive:
            algo.update(float(b) if won else 0.0)

    return {
        'clicks': int(clicks_got),
        'impressions': int(imp),
        'spent': float(spent),
        'budget': float(budget),
        'ctr': float(clicks_got / max(imp, 1)),
        # Guard the zero-budget case instead of dividing by zero.
        'budget_used': float(spent / budget) if budget else 0.0,
        'cpc': float(spent / max(clicks_got, 1)),
    }
# Run comparison
# Every algorithm replays the same test auctions with the same budget and
# auction limit, so the printed metrics are directly comparable.
budget = 5000
T = 10000
avg_pctr = float(pctr.mean())
algos = {
    'Linear': LinearBid(20, avg_pctr),
    'ORTB': ORTB(0.002, 8),
    'DualOGD': DualOGD(budget, T, VALUE_PER_CLICK),
    'Threshold': Threshold(0.3, 30),
}

print("\n" + "=" * 60)
print("BIDDING ALGORITHM COMPARISON ON REAL CRITEO DATA")
print("=" * 60)

results = {}
for name, algo in algos.items():
    results[name] = simulate(algo, pctr, price_test, y_test, budget, T)
    r = results[name]
    print(f"{name:12} Clicks:{r['clicks']:4} CTR:{r['ctr']:.4f} Budget:{r['budget_used']:.2%} CPC:{r['cpc']:.2f}")

# Persist the per-algorithm metrics; explicit encoding keeps the output
# independent of the platform default.
with open('results_real.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print("\nSaved to results_real.json")