#!/usr/bin/env python3 """ Train all three pg_plan_cache models: 1. SQL Cache Advisor (classification: low / medium / high) 2. Cache TTL Recommender (regression: seconds) 3. Query Complexity Estimator (regression: 1-100 score) Saves trained models as joblib files in the ./trained/ directory. """ import os import json import numpy as np from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import classification_report, mean_absolute_error, r2_score from sklearn.preprocessing import LabelEncoder import joblib from features import extract_features, FEATURE_NAMES from dataset import generate_dataset OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained") def train(): print("=" * 60) print(" pg_plan_cache — Model Training") print("=" * 60) # ── Generate data ───────────────────────────────────────── print("\n[1/5] Generating synthetic training data...") queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42) print(f" Generated {len(queries)} samples") # ── Extract features ────────────────────────────────────── print("\n[2/5] Extracting features...") X = np.array([extract_features(q) for q in queries]) print(f" Feature matrix: {X.shape}") # ── Encode labels ───────────────────────────────────────── le = LabelEncoder() y_benefit = le.fit_transform(benefits) # low=1, medium=2, high=0 y_ttl = np.array(ttls, dtype=float) y_complexity = np.array(complexities, dtype=float) # ── Split ───────────────────────────────────────────────── X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \ train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42) print(f" Train: {len(X_train)}, Test: {len(X_test)}") # ── Model 1: Cache Advisor (classification) ─────────────── print("\n[3/5] Training SQL Cache Advisor...") clf = RandomForestClassifier( n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1, ) clf.fit(X_train, yb_train) yb_pred = clf.predict(X_test) print("\n Classification Report:") report = classification_report(yb_test, yb_pred, target_names=le.classes_) print(" " + report.replace("\n", "\n ")) cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy") print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})") # ── Model 2: TTL Recommender (regression) ───────────────── print("\n[4/5] Training Cache TTL Recommender...") reg_ttl = GradientBoostingRegressor( n_estimators=200, max_depth=8, learning_rate=0.1, min_samples_split=5, random_state=42, ) reg_ttl.fit(X_train, yt_train) yt_pred = reg_ttl.predict(X_test) mae_ttl = mean_absolute_error(yt_test, yt_pred) r2_ttl = r2_score(yt_test, yt_pred) print(f" MAE: {mae_ttl:.1f} seconds") print(f" R2: {r2_ttl:.3f}") # ── Model 3: Complexity Estimator (regression) ──────────── print("\n[5/5] Training Query Complexity Estimator...") reg_cplx = GradientBoostingRegressor( n_estimators=200, max_depth=8, learning_rate=0.1, min_samples_split=5, random_state=42, ) reg_cplx.fit(X_train, yc_train) yc_pred = reg_cplx.predict(X_test) mae_cplx = mean_absolute_error(yc_test, yc_pred) r2_cplx = r2_score(yc_test, yc_pred) print(f" MAE: {mae_cplx:.1f} points") print(f" R2: {r2_cplx:.3f}") # ── Save models ─────────────────────────────────────────── os.makedirs(OUTPUT_DIR, exist_ok=True) joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib")) joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib")) joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib")) joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib")) # Feature importances importances = { "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())), "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())), "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())), } with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f: json.dump(importances, f, indent=2) # Model metadata metadata = { "models": { "cache_advisor": { "type": "RandomForestClassifier", "task": "classification", "classes": le.classes_.tolist(), "accuracy_cv5": round(float(cv_scores.mean()), 4), }, "ttl_recommender": { "type": "GradientBoostingRegressor", "task": "regression", "unit": "seconds", "mae": round(float(mae_ttl), 2), "r2": round(float(r2_ttl), 4), }, "complexity_estimator": { "type": "GradientBoostingRegressor", "task": "regression", "unit": "score (1-100)", "mae": round(float(mae_cplx), 2), "r2": round(float(r2_cplx), 4), }, }, "features": FEATURE_NAMES, "n_features": len(FEATURE_NAMES), "training_samples": len(queries), "test_samples": len(X_test), } with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f: json.dump(metadata, f, indent=2) print(f"\n Models saved to {OUTPUT_DIR}/") print(" Files: cache_advisor.joblib, ttl_recommender.joblib,") print(" complexity_estimator.joblib, label_encoder.joblib,") print(" feature_importances.json, metadata.json") print("\nDone.") if __name__ == "__main__": train()