nilenpatel committed on
Commit 406cec4 · verified · 1 Parent(s): 82b8f8b

Upload pg_plan_cache models

README.md ADDED
@@ -0,0 +1,88 @@
---
library_name: sklearn
tags:
- postgresql
- sql
- query-cache
- plan-cache
- redis
- database
- tabular-classification
- tabular-regression
pipeline_tag: tabular-classification
license: mit
---

# pg_plan_cache Models

Three machine learning models for the **pg_plan_cache** PostgreSQL extension — a query
execution plan cache backed by Redis.

## Models

### 1. SQL Cache Advisor
- **Task:** Classification (high / medium / low)
- **Algorithm:** Random Forest (200 trees)
- **Purpose:** Predicts whether caching a query's execution plan will be beneficial

### 2. Cache TTL Recommender
- **Task:** Regression (seconds)
- **Algorithm:** Gradient Boosting
- **Purpose:** Recommends an optimal cache TTL based on query characteristics

### 3. Query Complexity Estimator
- **Task:** Regression (1-100 score)
- **Algorithm:** Gradient Boosting
- **Purpose:** Estimates query complexity to prioritize caching resources

## Features

All models use 28 structural features extracted from raw SQL text:

| Feature | Description |
|---------|------------|
| `query_length` | Character count |
| `query_type` | SELECT=0, INSERT=1, UPDATE=2, DELETE=3, OTHER=4 |
| `num_tables` | Tables referenced |
| `num_joins` | JOIN clause count |
| `num_conditions` | AND/OR conditions |
| `num_aggregates` | Aggregate function count |
| `num_subqueries` | Subquery count |
| `has_window_func` | Window functions present |
| `has_cte` | Common Table Expressions present |
| `nesting_depth` | Maximum parenthesis depth |
| ... | 18 more features |

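The `nesting_depth` feature, for instance, is computed by a single left-to-right scan that tracks parenthesis depth. A stand-alone sketch mirroring `_nesting_depth` in `features.py`:

```python
def nesting_depth(sql: str) -> int:
    """Maximum parenthesis nesting depth of a SQL string."""
    max_depth = depth = 0
    for ch in sql:
        if ch == "(":
            depth += 1
            max_depth = max(max_depth, depth)
        elif ch == ")":
            depth -= 1
    return max_depth

print(nesting_depth(
    "SELECT * FROM t WHERE id IN (SELECT x FROM u WHERE EXISTS (SELECT 1))"
))  # 2
```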
## Usage

```python
from predict import predict, format_prediction

result = predict("SELECT u.name, COUNT(o.id) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.name")
print(format_prediction(result))
# Cache Benefit: HIGH
# Recommended TTL: 4200s (1h 10m)
# Complexity: 62/100 (complex)
```

## Training

Trained on 8,000 synthetic SQL queries across 18 complexity tiers:
- Simple SELECTs, filtered queries, ORDER BY
- Single- and multi-table JOINs
- Aggregations with GROUP BY / HAVING
- Subqueries, correlated subqueries, EXISTS
- CTEs, window functions, UNION
- Complex analytics queries
- INSERT / UPDATE / DELETE (non-cacheable)

```bash
pip install -r requirements.txt
python train.py
```

## About pg_plan_cache

pg_plan_cache is a PostgreSQL extension that caches query execution plans in Redis.
It hooks into the PostgreSQL planner, normalizes queries, computes SHA-256 hashes,
and stores serialized plans with configurable TTL and automatic schema-change invalidation.
dataset.py ADDED
@@ -0,0 +1,380 @@
"""
Synthetic training data generator for pg_plan_cache models.

Generates realistic SQL queries across a wide range of complexity levels
with labels for cache benefit, recommended TTL, and complexity score.
"""

import random

# ---------------------------------------------------------------------------
# Building blocks
# ---------------------------------------------------------------------------

TABLES = [
    "users", "orders", "products", "payments", "sessions",
    "logs", "events", "accounts", "invoices", "shipments",
    "categories", "reviews", "inventory", "notifications", "messages",
    "employees", "departments", "projects", "tasks", "comments",
]

SCHEMAS = ["public", "app", "analytics", "billing"]

COLUMNS = {
    "users": ["id", "name", "email", "created_at", "status", "age", "country"],
    "orders": ["id", "user_id", "total", "status", "created_at", "shipped_at"],
    "products": ["id", "name", "price", "category_id", "stock", "rating"],
    "payments": ["id", "order_id", "amount", "method", "paid_at", "status"],
    "sessions": ["id", "user_id", "started_at", "ended_at", "ip_address"],
    "logs": ["id", "level", "message", "created_at", "source"],
    "events": ["id", "type", "user_id", "data", "created_at"],
    "accounts": ["id", "owner_id", "balance", "currency", "opened_at"],
    "invoices": ["id", "account_id", "amount", "due_date", "status"],
    "shipments": ["id", "order_id", "carrier", "tracking", "shipped_at"],
    "categories": ["id", "name", "parent_id", "sort_order"],
    "reviews": ["id", "product_id", "user_id", "rating", "body", "created_at"],
    "inventory": ["id", "product_id", "warehouse_id", "quantity", "updated_at"],
    "notifications": ["id", "user_id", "type", "read", "created_at"],
    "messages": ["id", "sender_id", "receiver_id", "body", "sent_at"],
    "employees": ["id", "name", "department_id", "salary", "hired_at"],
    "departments": ["id", "name", "budget", "manager_id"],
    "projects": ["id", "name", "department_id", "deadline", "status"],
    "tasks": ["id", "project_id", "assignee_id", "title", "status", "due_date"],
    "comments": ["id", "task_id", "user_id", "body", "created_at"],
}

AGG_FUNCS = ["COUNT", "SUM", "AVG", "MIN", "MAX"]
COMPARISONS = ["=", ">", "<", ">=", "<=", "!="]
STRING_VALS = ["'active'", "'pending'", "'completed'", "'cancelled'", "'new'", "'shipped'"]
JOIN_TYPES = ["JOIN", "LEFT JOIN", "INNER JOIN", "RIGHT JOIN"]
WINDOW_FUNCS = ["ROW_NUMBER()", "RANK()", "DENSE_RANK()", "LAG(t.id, 1)", "LEAD(t.id, 1)"]


def _rand_table():
    return random.choice(TABLES)


def _rand_cols(table, n=None):
    cols = COLUMNS.get(table, ["id", "name"])
    n = n or random.randint(1, min(4, len(cols)))
    return random.sample(cols, min(n, len(cols)))


def _rand_where(alias="t"):
    col = random.choice(["id", "status", "created_at", "name", "amount", "age"])
    op = random.choice(COMPARISONS)
    if col == "status":
        return f"{alias}.{col} {op} {random.choice(STRING_VALS)}"
    elif col in ("id", "age", "amount"):
        return f"{alias}.{col} {op} {random.randint(1, 10000)}"
    else:
        return f"{alias}.{col} {op} '2024-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}'"


# ---------------------------------------------------------------------------
# Query generators by complexity tier
# ---------------------------------------------------------------------------

def _simple_select():
    """Tier 1: Simple SELECT with optional WHERE."""
    t = _rand_table()
    cols = ", ".join(_rand_cols(t))
    sql = f"SELECT {cols} FROM {t}"
    if random.random() > 0.3:
        # Qualify by table name: the FROM clause declares no alias.
        sql += f" WHERE {_rand_where(t)}"
    if random.random() > 0.7:
        sql += f" LIMIT {random.choice([10, 20, 50, 100])}"
    return sql, "low", random.randint(300, 900), random.randint(5, 20)


def _select_with_order():
    """Tier 1.5: SELECT with ORDER BY and LIMIT."""
    t = _rand_table()
    cols = ", ".join(_rand_cols(t))
    order_col = random.choice(COLUMNS.get(t, ["id"]))
    direction = random.choice(["ASC", "DESC"])
    sql = (
        f"SELECT {cols} FROM {t} WHERE {_rand_where(t)} "
        f"ORDER BY {order_col} {direction} LIMIT {random.choice([10, 25, 50])}"
    )
    return sql, "low", random.randint(600, 1200), random.randint(10, 25)


def _single_join():
    """Tier 2: Single JOIN query."""
    t1, t2 = random.sample(TABLES, 2)
    c1 = ", ".join(f"a.{c}" for c in _rand_cols(t1, 2))
    c2 = ", ".join(f"b.{c}" for c in _rand_cols(t2, 2))
    jtype = random.choice(JOIN_TYPES)
    sql = (
        f"SELECT {c1}, {c2} FROM {t1} a "
        f"{jtype} {t2} b ON a.id = b.{t1[:-1]}_id"
    )
    if random.random() > 0.4:
        sql += f" WHERE {_rand_where('a')}"
    return sql, "medium", random.randint(1800, 3600), random.randint(25, 45)


def _multi_join():
    """Tier 3: Multi-table JOIN."""
    tables = random.sample(TABLES, random.randint(3, 5))
    selects = []
    for i, t in enumerate(tables):
        alias = chr(97 + i)
        col = random.choice(COLUMNS.get(t, ["id"]))
        selects.append(f"{alias}.{col}")

    sql = f"SELECT {', '.join(selects)} FROM {tables[0]} a"
    for i in range(1, len(tables)):
        alias = chr(97 + i)
        prev_alias = chr(97 + i - 1)
        jtype = random.choice(JOIN_TYPES)
        sql += f" {jtype} {tables[i]} {alias} ON {prev_alias}.id = {alias}.{tables[i - 1][:-1]}_id"

    if random.random() > 0.3:
        sql += f" WHERE {_rand_where('a')}"
    if random.random() > 0.5:
        sql += f" ORDER BY a.id LIMIT {random.choice([50, 100, 200])}"
    return sql, "high", random.randint(3600, 7200), random.randint(45, 70)


def _aggregate_query():
    """Tier 3: Aggregation with GROUP BY."""
    t = _rand_table()
    group_col = random.choice(COLUMNS.get(t, ["id"])[:3])
    agg = random.choice(AGG_FUNCS)
    agg_col = random.choice(["id", "amount", "total", "price", "salary"])
    sql = f"SELECT {group_col}, {agg}({agg_col}) FROM {t}"
    if random.random() > 0.4:
        sql += f" WHERE {_rand_where(t)}"
    sql += f" GROUP BY {group_col}"
    if random.random() > 0.6:
        sql += f" HAVING {agg}({agg_col}) > {random.randint(1, 1000)}"
    if random.random() > 0.5:
        sql += f" ORDER BY {agg}({agg_col}) DESC"
    return sql, "high", random.randint(3600, 7200), random.randint(40, 65)


def _aggregate_join():
    """Tier 4: JOIN + aggregation."""
    t1, t2 = random.sample(TABLES, 2)
    agg = random.choice(AGG_FUNCS)
    group_col = f"a.{random.choice(COLUMNS.get(t1, ['id'])[:2])}"
    agg_col = f"b.{random.choice(['id', 'amount', 'total'])}"
    jtype = random.choice(JOIN_TYPES)
    sql = (
        f"SELECT {group_col}, {agg}({agg_col}) as agg_val "
        f"FROM {t1} a {jtype} {t2} b ON a.id = b.{t1[:-1]}_id "
        f"WHERE {_rand_where('a')} "
        f"GROUP BY {group_col}"
    )
    if random.random() > 0.5:
        sql += f" HAVING {agg}({agg_col}) > {random.randint(1, 500)}"
    sql += f" ORDER BY agg_val DESC LIMIT {random.choice([10, 20, 50])}"
    return sql, "high", random.randint(3600, 7200), random.randint(55, 80)


def _subquery():
    """Tier 4: Subquery."""
    t1, t2 = random.sample(TABLES, 2)
    cols = ", ".join(_rand_cols(t1, 2))
    sql = (
        f"SELECT {cols} FROM {t1} "
        f"WHERE id IN (SELECT {t1[:-1]}_id FROM {t2} "
        f"WHERE {_rand_where(t2)})"
    )
    return sql, "high", random.randint(3600, 5400), random.randint(50, 75)


def _correlated_subquery():
    """Tier 5: Correlated subquery."""
    t1, t2 = random.sample(TABLES, 2)
    agg = random.choice(AGG_FUNCS)
    sql = (
        f"SELECT a.id, a.name, "
        f"(SELECT {agg}(b.id) FROM {t2} b WHERE b.{t1[:-1]}_id = a.id) as sub_val "
        f"FROM {t1} a WHERE {_rand_where('a')}"
    )
    return sql, "high", random.randint(3600, 7200), random.randint(60, 85)


def _cte_query():
    """Tier 5: Common Table Expression (WITH)."""
    t1, t2 = random.sample(TABLES, 2)
    agg = random.choice(AGG_FUNCS)
    sql = (
        f"WITH cte AS ("
        f"SELECT {t1[:-1]}_id, {agg}(id) as cnt FROM {t2} GROUP BY {t1[:-1]}_id"
        f") SELECT a.id, a.name, c.cnt "
        f"FROM {t1} a JOIN cte c ON a.id = c.{t1[:-1]}_id "
        f"WHERE c.cnt > {random.randint(1, 50)} "
        f"ORDER BY c.cnt DESC"
    )
    return sql, "high", random.randint(3600, 7200), random.randint(65, 85)


def _window_query():
    """Tier 5: Window function."""
    t = _rand_table()
    wfunc = random.choice(["ROW_NUMBER()", "RANK()", "DENSE_RANK()"])
    partition_col = random.choice(COLUMNS.get(t, ["id"])[:2])
    order_col = random.choice(["id", "created_at"])
    sql = (
        f"SELECT id, {partition_col}, "
        f"{wfunc} OVER (PARTITION BY {partition_col} ORDER BY {order_col} DESC) as rn "
        f"FROM {t} WHERE {_rand_where(t)}"
    )
    return sql, "high", random.randint(3600, 7200), random.randint(55, 80)


def _union_query():
    """Tier 4: UNION query."""
    t1, t2 = random.sample(TABLES, 2)
    sql = (
        f"SELECT id, name FROM {t1} WHERE {_rand_where(t1)} "
        f"UNION ALL "
        f"SELECT id, name FROM {t2} WHERE {_rand_where(t2)}"
    )
    return sql, "medium", random.randint(1800, 3600), random.randint(35, 55)


def _complex_analytics():
    """Tier 6: Complex analytics query."""
    t1, t2, t3 = random.sample(TABLES, 3)
    agg1 = random.choice(AGG_FUNCS)
    agg2 = random.choice(AGG_FUNCS)
    sql = (
        f"WITH monthly AS ("
        f"SELECT a.id, a.name, {agg1}(b.id) as cnt, {agg2}(c.id) as total "
        f"FROM {t1} a "
        f"LEFT JOIN {t2} b ON a.id = b.{t1[:-1]}_id "
        f"LEFT JOIN {t3} c ON b.id = c.{t2[:-1]}_id "
        f"WHERE a.created_at >= '2024-01-01' "
        f"GROUP BY a.id, a.name "
        f"HAVING {agg1}(b.id) > {random.randint(1, 20)}"
        f") SELECT name, cnt, total, "
        f"RANK() OVER (ORDER BY cnt DESC) as rank "
        f"FROM monthly ORDER BY rank LIMIT 100"
    )
    return sql, "high", random.randint(5400, 7200), random.randint(80, 100)


def _insert_query():
    """INSERT — not cacheable."""
    t = _rand_table()
    cols = _rand_cols(t, 3)
    vals = ", ".join(
        f"{random.randint(1, 9999)}" if c in ("id", "age") else f"'val_{random.randint(1, 99)}'"
        for c in cols
    )
    sql = f"INSERT INTO {t} ({', '.join(cols)}) VALUES ({vals})"
    return sql, "low", 0, random.randint(5, 15)


def _update_query():
    """UPDATE — not cacheable."""
    t = _rand_table()
    col = random.choice(COLUMNS.get(t, ["name"])[1:])
    sql = f"UPDATE {t} SET {col} = 'updated' WHERE {_rand_where(t)}"
    return sql, "low", 0, random.randint(5, 15)


def _delete_query():
    """DELETE — not cacheable."""
    t = _rand_table()
    sql = f"DELETE FROM {t} WHERE {_rand_where(t)}"
    return sql, "low", 0, random.randint(5, 10)


def _exists_query():
    """Tier 4: EXISTS subquery."""
    t1, t2 = random.sample(TABLES, 2)
    cols = ", ".join(_rand_cols(t1, 2))
    sql = (
        f"SELECT {cols} FROM {t1} a "
        f"WHERE EXISTS (SELECT 1 FROM {t2} b WHERE b.{t1[:-1]}_id = a.id "
        f"AND {_rand_where('b')})"
    )
    return sql, "high", random.randint(3600, 5400), random.randint(50, 70)


def _case_query():
    """Tier 3: CASE expression."""
    t = _rand_table()
    sql = (
        f"SELECT id, "
        f"CASE WHEN status = 'active' THEN 'A' "
        f"WHEN status = 'pending' THEN 'P' "
        f"ELSE 'X' END as status_code, "
        f"name FROM {t} WHERE {_rand_where(t)}"
    )
    return sql, "medium", random.randint(1800, 3600), random.randint(25, 40)


def _distinct_query():
    """Tier 2: SELECT DISTINCT."""
    t = _rand_table()
    col = random.choice(COLUMNS.get(t, ["name"])[:3])
    sql = f"SELECT DISTINCT {col} FROM {t} WHERE {_rand_where(t)} ORDER BY {col}"
    return sql, "medium", random.randint(1200, 2400), random.randint(20, 35)


# ---------------------------------------------------------------------------
# Generator registry
# ---------------------------------------------------------------------------

GENERATORS = [
    (_simple_select, 15),
    (_select_with_order, 10),
    (_single_join, 12),
    (_multi_join, 8),
    (_aggregate_query, 10),
    (_aggregate_join, 8),
    (_subquery, 7),
    (_correlated_subquery, 5),
    (_cte_query, 5),
    (_window_query, 5),
    (_union_query, 4),
    (_complex_analytics, 3),
    (_insert_query, 8),
    (_update_query, 5),
    (_delete_query, 4),
    (_exists_query, 5),
    (_case_query, 4),
    (_distinct_query, 4),
]

# Build weighted list
_WEIGHTED = []
for gen, weight in GENERATORS:
    _WEIGHTED.extend([gen] * weight)


def generate_sample():
    """Generate one (sql, cache_benefit, ttl, complexity) sample."""
    gen = random.choice(_WEIGHTED)
    sql, benefit, ttl, complexity = gen()
    # Add slight noise to TTL and complexity; keep non-cacheable queries at TTL 0.
    if ttl > 0:
        ttl = max(0, ttl + random.randint(-60, 60))
    complexity = max(1, min(100, complexity + random.randint(-3, 3)))
    return sql, benefit, ttl, complexity


def generate_dataset(n: int = 5000, seed: int = 42):
    """
    Generate a training dataset of n samples.

    Returns:
        queries: list[str]
        benefits: list[str] — "low", "medium", "high"
        ttls: list[int] — recommended TTL in seconds
        complexities: list[int] — 1-100 complexity score
    """
    random.seed(seed)
    queries, benefits, ttls, complexities = [], [], [], []
    for _ in range(n):
        sql, benefit, ttl, complexity = generate_sample()
        queries.append(sql)
        benefits.append(benefit)
        ttls.append(ttl)
        complexities.append(complexity)
    return queries, benefits, ttls, complexities
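The `GENERATORS` registry above implements weighted sampling by repeating each generator `weight` times in a flat pool. A minimal self-contained sketch of the same pattern, using made-up tier names and only the standard library:

```python
import random

# Hypothetical (generator, weight) pairs standing in for the real registry.
GENERATORS = [("simple_select", 15), ("single_join", 12), ("cte_query", 5)]

# Flatten into a weighted pool, exactly as dataset.py does with _WEIGHTED.
pool = [name for name, weight in GENERATORS for _ in range(weight)]

random.seed(42)
picks = [random.choice(pool) for _ in range(1000)]
# Heavier tiers are drawn proportionally more often.
print(picks.count("simple_select"), picks.count("cte_query"))
```

An equivalent without the flattened pool is `random.choices([g for g, _ in GENERATORS], weights=[w for _, w in GENERATORS], k=1)`; the flat-list form trades a little memory for a single `random.choice` per sample.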
features.py ADDED
@@ -0,0 +1,172 @@
"""
SQL feature extraction for pg_plan_cache models.

Extracts structural features from raw SQL query text to feed into
the Cache Advisor, TTL Recommender, and Complexity Estimator models.
"""

import re


AGGREGATE_FUNCS = re.compile(
    r"\b(count|sum|avg|min|max|array_agg|string_agg|bool_and|bool_or|jsonb_agg)\s*\(",
    re.IGNORECASE,
)
WINDOW_FUNCS = re.compile(
    r"\b(row_number|rank|dense_rank|ntile|lag|lead|first_value|last_value|nth_value)\s*\(",
    re.IGNORECASE,
)
JOIN_PATTERN = re.compile(
    r"\b(inner\s+join|left\s+join|right\s+join|full\s+join|cross\s+join|join)\b",
    re.IGNORECASE,
)
SUBQUERY_PATTERN = re.compile(r"\(\s*select\b", re.IGNORECASE)
CTE_PATTERN = re.compile(r"\bwith\s+\w+\s+as\s*\(", re.IGNORECASE)
UNION_PATTERN = re.compile(r"\b(union|intersect|except)\b", re.IGNORECASE)
CASE_PATTERN = re.compile(r"\bcase\b", re.IGNORECASE)
IN_PATTERN = re.compile(r"\bin\s*\(", re.IGNORECASE)
LIKE_PATTERN = re.compile(r"\b(like|ilike)\b", re.IGNORECASE)
BETWEEN_PATTERN = re.compile(r"\bbetween\b", re.IGNORECASE)
EXISTS_PATTERN = re.compile(r"\bexists\s*\(", re.IGNORECASE)
HAVING_PATTERN = re.compile(r"\bhaving\b", re.IGNORECASE)
# `::` contains no word characters, so it cannot sit between \b anchors; match it literally.
CAST_PATTERN = re.compile(r"\bcast\s*\(|::", re.IGNORECASE)

FEATURE_NAMES = [
    "query_length",
    "query_type",  # 0=SELECT, 1=INSERT, 2=UPDATE, 3=DELETE, 4=OTHER
    "num_tables",
    "num_joins",
    "num_conditions",
    "num_aggregates",
    "num_subqueries",
    "num_columns",
    "has_distinct",
    "has_order_by",
    "has_group_by",
    "has_having",
    "has_limit",
    "has_offset",
    "has_where",
    "has_like",
    "has_in_clause",
    "has_between",
    "has_exists",
    "has_window_func",
    "has_cte",
    "has_union",
    "has_case",
    "has_cast",
    "nesting_depth",
    "num_and_or",
    "num_string_literals",
    "num_numeric_literals",
]


def _count_tables(sql: str) -> int:
    """Estimate the number of tables referenced."""
    count = 0
    # FROM clause tables
    from_match = re.search(
        r"\bfrom\s+(.+?)(?:\bwhere\b|\bjoin\b|\bgroup\b|\border\b|\blimit\b|\bhaving\b|;|$)",
        sql,
        re.IGNORECASE | re.DOTALL,
    )
    if from_match:
        from_clause = from_match.group(1)
        count += len(re.split(r",", from_clause))
    # JOIN tables
    count += len(JOIN_PATTERN.findall(sql))
    return max(count, 0)


def _count_columns(sql: str) -> int:
    """Estimate the number of columns in the SELECT clause."""
    match = re.search(r"\bselect\s+(.*?)\bfrom\b", sql, re.IGNORECASE | re.DOTALL)
    if not match:
        return 0
    select_clause = match.group(1).strip()
    if select_clause == "*":
        return 1
    # Split by commas not inside parentheses
    depth = 0
    count = 1
    for ch in select_clause:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
        elif ch == ',' and depth == 0:
            count += 1
    return count


def _nesting_depth(sql: str) -> int:
    """Calculate maximum parenthesis nesting depth."""
    max_depth = 0
    depth = 0
    for ch in sql:
        if ch == '(':
            depth += 1
            max_depth = max(max_depth, depth)
        elif ch == ')':
            depth -= 1
    return max_depth


def extract_features(sql: str) -> list[float]:
    """
    Extract a fixed-length feature vector from a SQL query string.

    Returns a list of floats matching FEATURE_NAMES ordering.
    """
    sql = sql.strip()
    upper = sql.upper()

    # Query type
    if upper.startswith("SELECT"):
        qtype = 0
    elif upper.startswith("INSERT"):
        qtype = 1
    elif upper.startswith("UPDATE"):
        qtype = 2
    elif upper.startswith("DELETE"):
        qtype = 3
    else:
        qtype = 4

    num_joins = len(JOIN_PATTERN.findall(sql))
    num_aggs = len(AGGREGATE_FUNCS.findall(sql))
    num_subqueries = len(SUBQUERY_PATTERN.findall(sql))
    num_conditions = len(re.findall(r"\b(and|or)\b", sql, re.IGNORECASE))
    num_string_lits = len(re.findall(r"'[^']*'", sql))
    num_numeric_lits = len(re.findall(r"\b\d+(?:\.\d+)?\b", sql))

    features = [
        float(len(sql)),                                        # query_length
        float(qtype),                                           # query_type
        float(_count_tables(sql)),                              # num_tables
        float(num_joins),                                       # num_joins
        float(num_conditions),                                  # num_conditions
        float(num_aggs),                                        # num_aggregates
        float(num_subqueries),                                  # num_subqueries
        float(_count_columns(sql)),                             # num_columns
        float(bool(re.search(r"\bdistinct\b", sql, re.I))),     # has_distinct
        float(bool(re.search(r"\border\s+by\b", sql, re.I))),   # has_order_by
        float(bool(re.search(r"\bgroup\s+by\b", sql, re.I))),   # has_group_by
        float(bool(HAVING_PATTERN.search(sql))),                # has_having
        float(bool(re.search(r"\blimit\b", sql, re.I))),        # has_limit
        float(bool(re.search(r"\boffset\b", sql, re.I))),       # has_offset
        float(bool(re.search(r"\bwhere\b", sql, re.I))),        # has_where
        float(bool(LIKE_PATTERN.search(sql))),                  # has_like
        float(bool(IN_PATTERN.search(sql))),                    # has_in_clause
        float(bool(BETWEEN_PATTERN.search(sql))),               # has_between
        float(bool(EXISTS_PATTERN.search(sql))),                # has_exists
        float(bool(WINDOW_FUNCS.search(sql))),                  # has_window_func
        float(bool(CTE_PATTERN.search(sql))),                   # has_cte
        float(bool(UNION_PATTERN.search(sql))),                 # has_union
        float(bool(CASE_PATTERN.search(sql))),                  # has_case
        float(bool(CAST_PATTERN.search(sql))),                  # has_cast
        float(_nesting_depth(sql)),                             # nesting_depth
        float(num_conditions),                                  # num_and_or
        float(num_string_lits),                                 # num_string_literals
        float(num_numeric_lits),                                # num_numeric_literals
    ]

    return features
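`_count_columns` above relies on splitting the SELECT list only at commas that sit outside parentheses, so an expression like `COUNT(a, b)` counts as one column. A self-contained sketch of that depth-tracking split:

```python
def count_select_items(select_clause: str) -> int:
    """Count top-level comma-separated items, ignoring commas inside parentheses."""
    depth, count = 0, 1
    for ch in select_clause:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            count += 1
    return count

print(count_select_items("u.name, COUNT(o.id, o.x), SUM(t.v)"))  # 3
```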
predict.py ADDED
@@ -0,0 +1,137 @@
"""
Inference API for pg_plan_cache models.

Loads trained models and provides prediction functions for:
  1. Cache benefit (high / medium / low)
  2. Recommended TTL (seconds)
  3. Complexity score (1-100)
"""

import os
import json
import joblib
import numpy as np
from features import extract_features, FEATURE_NAMES

MODEL_DIR = os.path.join(os.path.dirname(__file__), "trained")

_cache_advisor = None
_ttl_recommender = None
_complexity_estimator = None
_label_encoder = None
_loaded = False


def _load_models():
    """Lazy-load all models from disk."""
    global _cache_advisor, _ttl_recommender, _complexity_estimator, _label_encoder, _loaded
    if _loaded:
        return

    _cache_advisor = joblib.load(os.path.join(MODEL_DIR, "cache_advisor.joblib"))
    _ttl_recommender = joblib.load(os.path.join(MODEL_DIR, "ttl_recommender.joblib"))
    _complexity_estimator = joblib.load(os.path.join(MODEL_DIR, "complexity_estimator.joblib"))
    _label_encoder = joblib.load(os.path.join(MODEL_DIR, "label_encoder.joblib"))
    _loaded = True


def predict(sql: str) -> dict:
    """
    Run all three models on a SQL query.

    Returns:
        {
            "query": str,
            "cache_benefit": "high" | "medium" | "low",
            "cache_benefit_probabilities": {"high": 0.8, "medium": 0.15, "low": 0.05},
            "recommended_ttl": int,      # seconds
            "ttl_human": str,            # e.g. "1h 0m"
            "complexity_score": int,     # 1-100
            "complexity_label": str,     # "simple" | "moderate" | "complex" | "very complex"
            "features": {name: value, ...},
        }
    """
    _load_models()

    features = extract_features(sql)
    X = np.array([features])

    # Cache advisor
    benefit_idx = _cache_advisor.predict(X)[0]
    benefit_label = _label_encoder.inverse_transform([benefit_idx])[0]
    benefit_probs = _cache_advisor.predict_proba(X)[0]
    prob_dict = {
        _label_encoder.inverse_transform([i])[0]: round(float(p), 4)
        for i, p in enumerate(benefit_probs)
    }

    # TTL recommender
    ttl_raw = _ttl_recommender.predict(X)[0]
    ttl = max(0, int(round(ttl_raw)))
    hours, mins = divmod(ttl // 60, 60)
    ttl_human = f"{hours}h {mins}m" if hours else f"{mins}m"

    # Complexity estimator
    cplx_raw = _complexity_estimator.predict(X)[0]
    cplx = max(1, min(100, int(round(cplx_raw))))
    if cplx <= 20:
        cplx_label = "simple"
    elif cplx <= 45:
        cplx_label = "moderate"
    elif cplx <= 75:
        cplx_label = "complex"
    else:
        cplx_label = "very complex"

    return {
        "query": sql,
        "cache_benefit": benefit_label,
        "cache_benefit_probabilities": prob_dict,
        "recommended_ttl": ttl,
        "ttl_human": ttl_human,
        "complexity_score": cplx,
        "complexity_label": cplx_label,
        "features": dict(zip(FEATURE_NAMES, features)),
    }


def predict_batch(queries: list[str]) -> list[dict]:
    """Run predictions on multiple queries."""
    return [predict(q) for q in queries]


def format_prediction(result: dict) -> str:
    """Format a prediction result as a readable string."""
    lines = [
        f"  Query: {result['query'][:100]}{'...' if len(result['query']) > 100 else ''}",
        f"  Cache Benefit: {result['cache_benefit'].upper()}",
        f"  Probabilities: {result['cache_benefit_probabilities']}",
        f"  Recommended TTL: {result['recommended_ttl']}s ({result['ttl_human']})",
        f"  Complexity: {result['complexity_score']}/100 ({result['complexity_label']})",
    ]
    return "\n".join(lines)


def get_model_info() -> dict:
    """Return model metadata."""
    meta_path = os.path.join(MODEL_DIR, "metadata.json")
    if os.path.exists(meta_path):
        with open(meta_path) as f:
            return json.load(f)
    return {"error": "metadata.json not found. Run train.py first."}


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print('Usage: python predict.py "SELECT * FROM users WHERE id = 42"')
        sys.exit(1)

    sql = " ".join(sys.argv[1:])
    result = predict(sql)
    print(format_prediction(result))
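The TTL humanization in `predict()` is a single `divmod` over minutes. Extracted as a stand-alone sketch of the same logic:

```python
def ttl_human(ttl_seconds: int) -> str:
    """Render a TTL in seconds as 'Xh Ym', or 'Ym' under an hour."""
    hours, mins = divmod(ttl_seconds // 60, 60)
    return f"{hours}h {mins}m" if hours else f"{mins}m"

print(ttl_human(4200))  # 1h 10m
print(ttl_human(900))   # 15m
```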
requirements.txt ADDED
@@ -0,0 +1,4 @@
scikit-learn>=1.4.0
joblib>=1.3.0
numpy>=1.26.0
huggingface_hub>=0.24.0
train.py ADDED
@@ -0,0 +1,164 @@
+ #!/usr/bin/env python3
+ """
+ Train all three pg_plan_cache models:
+ 1. SQL Cache Advisor (classification: low / medium / high)
+ 2. Cache TTL Recommender (regression: seconds)
+ 3. Query Complexity Estimator (regression: 1-100 score)
+
+ Saves trained models as joblib files in the ./trained/ directory.
+ """
+
+ import os
+ import json
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import classification_report, mean_absolute_error, r2_score
+ from sklearn.preprocessing import LabelEncoder
+ import joblib
+
+ from features import extract_features, FEATURE_NAMES
+ from dataset import generate_dataset
+
+ OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained")
+
+
+ def train():
+     print("=" * 60)
+     print(" pg_plan_cache — Model Training")
+     print("=" * 60)
+
+     # ── Generate data ─────────────────────────────────────────
+     print("\n[1/5] Generating synthetic training data...")
+     queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42)
+     print(f"      Generated {len(queries)} samples")
+
+     # ── Extract features ──────────────────────────────────────
+     print("\n[2/5] Extracting features...")
+     X = np.array([extract_features(q) for q in queries])
+     print(f"      Feature matrix: {X.shape}")
+
+     # ── Encode labels ─────────────────────────────────────────
+     le = LabelEncoder()
+     y_benefit = le.fit_transform(benefits)  # alphabetical: high=0, low=1, medium=2
+     y_ttl = np.array(ttls, dtype=float)
+     y_complexity = np.array(complexities, dtype=float)
+
+     # ── Split ─────────────────────────────────────────────────
+     X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \
+         train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42)
+
+     print(f"      Train: {len(X_train)}, Test: {len(X_test)}")
+
+     # ── Model 1: Cache Advisor (classification) ───────────────
+     print("\n[3/5] Training SQL Cache Advisor...")
+     clf = RandomForestClassifier(
+         n_estimators=200,
+         max_depth=15,
+         min_samples_split=5,
+         min_samples_leaf=2,
+         random_state=42,
+         n_jobs=-1,
+     )
+     clf.fit(X_train, yb_train)
+
+     yb_pred = clf.predict(X_test)
+     print("\n   Classification Report:")
+     report = classification_report(yb_test, yb_pred, target_names=le.classes_)
+     print("   " + report.replace("\n", "\n   "))
+
+     cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy")
+     print(f"   Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
+
+     # ── Model 2: TTL Recommender (regression) ─────────────────
+     print("\n[4/5] Training Cache TTL Recommender...")
+     reg_ttl = GradientBoostingRegressor(
+         n_estimators=200,
+         max_depth=8,
+         learning_rate=0.1,
+         min_samples_split=5,
+         random_state=42,
+     )
+     reg_ttl.fit(X_train, yt_train)
+
+     yt_pred = reg_ttl.predict(X_test)
+     mae_ttl = mean_absolute_error(yt_test, yt_pred)
+     r2_ttl = r2_score(yt_test, yt_pred)
+     print(f"   MAE: {mae_ttl:.1f} seconds")
+     print(f"   R2:  {r2_ttl:.3f}")
+
+     # ── Model 3: Complexity Estimator (regression) ────────────
+     print("\n[5/5] Training Query Complexity Estimator...")
+     reg_cplx = GradientBoostingRegressor(
+         n_estimators=200,
+         max_depth=8,
+         learning_rate=0.1,
+         min_samples_split=5,
+         random_state=42,
+     )
+     reg_cplx.fit(X_train, yc_train)
+
+     yc_pred = reg_cplx.predict(X_test)
+     mae_cplx = mean_absolute_error(yc_test, yc_pred)
+     r2_cplx = r2_score(yc_test, yc_pred)
+     print(f"   MAE: {mae_cplx:.1f} points")
+     print(f"   R2:  {r2_cplx:.3f}")
+
+     # ── Save models ───────────────────────────────────────────
+     os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+     joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib"))
+     joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib"))
+     joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib"))
+     joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))
+
+     # Feature importances
+     importances = {
+         "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())),
+         "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())),
+         "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())),
+     }
+     with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f:
+         json.dump(importances, f, indent=2)
+
+     # Model metadata
+     metadata = {
+         "models": {
+             "cache_advisor": {
+                 "type": "RandomForestClassifier",
+                 "task": "classification",
+                 "classes": le.classes_.tolist(),
+                 "accuracy_cv5": round(float(cv_scores.mean()), 4),
+             },
+             "ttl_recommender": {
+                 "type": "GradientBoostingRegressor",
+                 "task": "regression",
+                 "unit": "seconds",
+                 "mae": round(float(mae_ttl), 2),
+                 "r2": round(float(r2_ttl), 4),
+             },
+             "complexity_estimator": {
+                 "type": "GradientBoostingRegressor",
+                 "task": "regression",
+                 "unit": "score (1-100)",
+                 "mae": round(float(mae_cplx), 2),
+                 "r2": round(float(r2_cplx), 4),
+             },
+         },
+         "features": FEATURE_NAMES,
+         "n_features": len(FEATURE_NAMES),
+         "training_samples": len(queries),
+         "test_samples": len(X_test),
+     }
+     with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f:
+         json.dump(metadata, f, indent=2)
+
+     print(f"\n   Models saved to {OUTPUT_DIR}/")
+     print("   Files: cache_advisor.joblib, ttl_recommender.joblib,")
+     print("          complexity_estimator.joblib, label_encoder.joblib,")
+     print("          feature_importances.json, metadata.json")
+     print("\nDone.")
+
+
+ if __name__ == "__main__":
+     train()
trained/cache_advisor.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e11ba948fd643d426b62362f7fd71e30ec90e4a1f1593b2606ae1e31b7b3b19f
+ size 818001
trained/complexity_estimator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bd2a5edfce1496bc23a0686e6822ff3d583c884ad9922d9eed5f369ef0b064b
+ size 3038236
trained/feature_importances.json ADDED
@@ -0,0 +1,92 @@
+ {
+   "cache_advisor": {
+     "query_length": 0.19116243566746416,
+     "query_type": 0.02137394504176744,
+     "num_tables": 0.09222282366305111,
+     "num_joins": 0.0748793608388074,
+     "num_conditions": 0.00154549784133088,
+     "num_aggregates": 0.0618503755668228,
+     "num_subqueries": 0.05156804724205885,
+     "num_columns": 0.07578970828634744,
+     "has_distinct": 0.04377194157855687,
+     "has_order_by": 0.03645645249300166,
+     "has_group_by": 0.04425844049972725,
+     "has_having": 0.0022541480803507635,
+     "has_limit": 0.042062573427220216,
+     "has_offset": 0.0,
+     "has_where": 0.008477512665144578,
+     "has_like": 0.0,
+     "has_in_clause": 0.005441079955562388,
+     "has_between": 0.0,
+     "has_exists": 0.0009272674367364887,
+     "has_window_func": 0.010171898283664462,
+     "has_cte": 0.0017415634776680982,
+     "has_union": 0.021229522300210402,
+     "has_case": 0.010714231584388431,
+     "has_cast": 0.0,
+     "nesting_depth": 0.1651162458494366,
+     "num_and_or": 0.0018247999615881344,
+     "num_string_literals": 0.02825993434632328,
+     "num_numeric_literals": 0.006900193912770316
+   },
+   "ttl_recommender": {
+     "query_length": 0.49334167936522283,
+     "query_type": 0.011472503279799304,
+     "num_tables": 0.04121816512371646,
+     "num_joins": 0.05664091770080013,
+     "num_conditions": 2.6766564086239894e-05,
+     "num_aggregates": 0.08454674221524747,
+     "num_subqueries": 0.012819407143812049,
+     "num_columns": 0.003503947545486143,
+     "has_distinct": 0.0058846177923228245,
+     "has_order_by": 0.0030112892658353254,
+     "has_group_by": 0.11555986501253222,
+     "has_having": 0.0005654100636265899,
+     "has_limit": 0.020011249481941062,
+     "has_offset": 0.0,
+     "has_where": 0.0006198304413308254,
+     "has_like": 0.0,
+     "has_in_clause": 0.006723068906959933,
+     "has_between": 0.0,
+     "has_exists": 1.5939534844064166e-05,
+     "has_window_func": 0.0016085055032078448,
+     "has_cte": 2.3841716696771857e-05,
+     "has_union": 5.051873650507809e-05,
+     "has_case": 2.1925568628142657e-05,
+     "has_cast": 0.0,
+     "nesting_depth": 0.13173720022142668,
+     "num_and_or": 2.27992721191164e-05,
+     "num_string_literals": 0.005676787044969987,
+     "num_numeric_literals": 0.004897022498882968
+   },
+   "complexity_estimator": {
+     "query_length": 0.5344926759628151,
+     "query_type": 0.0015962377188123598,
+     "num_tables": 0.031559929024199504,
+     "num_joins": 0.02335110657414861,
+     "num_conditions": 5.757862902242119e-05,
+     "num_aggregates": 0.04750932601796666,
+     "num_subqueries": 0.008970394733974358,
+     "num_columns": 0.00588104652025957,
+     "has_distinct": 0.01062122091510926,
+     "has_order_by": 0.0024661023837127443,
+     "has_group_by": 0.061828695835283276,
+     "has_having": 0.00034502697726715757,
+     "has_limit": 0.020807067356268808,
+     "has_offset": 0.0,
+     "has_where": 0.0004570231775885458,
+     "has_like": 0.0,
+     "has_in_clause": 0.013672027252240813,
+     "has_between": 0.0,
+     "has_exists": 7.242098418966911e-05,
+     "has_window_func": 0.0009971635825058846,
+     "has_cte": 1.4790912091677233e-05,
+     "has_union": 0.006250913065401877,
+     "has_case": 1.6824403324258042e-05,
+     "has_cast": 0.0,
+     "nesting_depth": 0.22281668760327789,
+     "num_and_or": 7.125446039372882e-05,
+     "num_string_literals": 0.003162795018354366,
+     "num_numeric_literals": 0.0029816908917914696
+   }
+ }
trained/label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dca6130147e0c2d5e5b985a5abb3087d622fbe3da1e3e09ce3c5a79cc5fd15e8
+ size 399
trained/metadata.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "models": {
+     "cache_advisor": {
+       "type": "RandomForestClassifier",
+       "task": "classification",
+       "classes": [
+         "high",
+         "low",
+         "medium"
+       ],
+       "accuracy_cv5": 1.0
+     },
+     "ttl_recommender": {
+       "type": "GradientBoostingRegressor",
+       "task": "regression",
+       "unit": "seconds",
+       "mae": 494.56,
+       "r2": 0.8994
+     },
+     "complexity_estimator": {
+       "type": "GradientBoostingRegressor",
+       "task": "regression",
+       "unit": "score (1-100)",
+       "mae": 5.57,
+       "r2": 0.9216
+     }
+   },
+   "features": [
+     "query_length",
+     "query_type",
+     "num_tables",
+     "num_joins",
+     "num_conditions",
+     "num_aggregates",
+     "num_subqueries",
+     "num_columns",
+     "has_distinct",
+     "has_order_by",
+     "has_group_by",
+     "has_having",
+     "has_limit",
+     "has_offset",
+     "has_where",
+     "has_like",
+     "has_in_clause",
+     "has_between",
+     "has_exists",
+     "has_window_func",
+     "has_cte",
+     "has_union",
+     "has_case",
+     "has_cast",
+     "nesting_depth",
+     "num_and_or",
+     "num_string_literals",
+     "num_numeric_literals"
+   ],
+   "n_features": 28,
+   "training_samples": 8000,
+   "test_samples": 1600
+ }
trained/ttl_recommender.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ac8fbc0829aba31da6ff9ea299f512b63ed95c065cc2ae7a5779c7a110486aa
+ size 3066316