[
  {
    "task_id": "medium_001",
    "difficulty": "medium",
    "query": "SELECT * FROM events ORDER BY created_at DESC;",
    "schema": {
      "events": {
        "id": "BIGINT PRIMARY KEY",
        "event_name": "VARCHAR(255)",
        "payload": "JSON",
        "created_at": "TIMESTAMP INDEX",
        "actor_id": "BIGINT",
        "metadata": "JSON"
      }
    },
    "context": "Show the most recent events on an admin dashboard.",
    "ground_truth_issues": [
      {
        "id": "medium_001_select_star",
        "category": "performance",
        "description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.",
        "severity": 0.3,
        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
        "keywords": [ "select *", "wide table", "projection", "performance", "star", "all columns", "unnecessary columns", "column selection", "over-fetching", "wildcard" ]
      },
      {
        "id": "medium_001_missing_limit",
        "category": "performance",
        "description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.",
        "severity": 0.3,
        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
        "keywords": [ "limit", "unbounded query", "dashboard", "performance", "no limit", "missing limit", "unlimited rows", "pagination", "all rows", "full scan", "row count" ]
      }
    ],
    "max_steps": 5
  },
  {
    "task_id": "medium_002",
    "difficulty": "medium",
    "query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;",
    "schema": {
      "customers": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)"
      },
      "orders": {
        "id": "INT PRIMARY KEY",
        "customer_id": "INT INDEX",
        "total": "DECIMAL(10,2)"
      }
    },
    "context": "Show each customer with the number of orders they have placed.",
    "ground_truth_issues": [
      {
        "id": "medium_002_correlated_subquery",
        "category": "performance",
        "description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.",
        "severity": 0.6,
        "fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;",
        "keywords": [ "correlated subquery", "group by", "join", "count", "performance", "subquery per row", "n+1", "rewrite", "left join", "aggregate", "scalar subquery", "dependent subquery" ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_003",
    "difficulty": "medium",
    "query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;",
    "schema": {
      "users": {
        "id": "INT PRIMARY KEY",
        "email": "VARCHAR(255) UNIQUE",
        "last_login_at": "TIMESTAMP NULL"
      }
    },
    "context": "Export non-null user emails for a CRM sync.",
    "ground_truth_issues": [
      {
        "id": "medium_003_redundant_distinct",
        "category": "performance",
        "description": "DISTINCT is redundant because users.email is already unique.",
        "severity": 0.45,
        "fix": "SELECT email FROM users WHERE email IS NOT NULL;",
        "keywords": [ "distinct", "unique", "redundant", "email", "performance", "unnecessary distinct", "unique constraint", "already unique", "duplicate elimination", "deduplication", "wasted sort" ]
      }
    ],
    "max_steps": 5
  },
  {
    "task_id": "medium_004",
    "difficulty": "medium",
    "query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';",
    "schema": {
      "orders": {
        "id": "INT PRIMARY KEY",
        "user_id": "INT INDEX",
        "created_at": "TIMESTAMP INDEX",
        "total": "DECIMAL(10,2)"
      },
      "users": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)"
      }
    },
    "context": "List orders placed on a specific date with the user name attached.",
    "ground_truth_issues": [
      {
        "id": "medium_004_function_on_indexed_column",
        "category": "performance",
        "description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.",
        "severity": 0.6,
        "fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';",
        "keywords": [ "date()", "function on column", "index", "range predicate", "performance", "sargable", "non-sargable", "prevents index", "full scan", "index usage", "function wrapping" ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_005",
    "difficulty": "medium",
    "query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';",
    "schema": {
      "products": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255) INDEX",
        "category_id": "INT",
        "price": "DECIMAL(10,2)"
      }
    },
    "context": "Search products whose names contain the text pro.",
    "ground_truth_issues": [
      {
        "id": "medium_005_lower_blocks_index",
        "category": "performance",
        "description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.",
        "severity": 0.35,
        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
        "keywords": [ "lower", "function on column", "index", "performance", "sargable", "non-sargable", "case insensitive", "full scan", "table scan", "function wrapping column" ]
      },
      {
        "id": "medium_005_leading_wildcard",
        "category": "performance",
        "description": "The leading wildcard in LIKE '%pro%' forces a full scan instead of an index-friendly prefix lookup.",
        "severity": 0.35,
        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
        "keywords": [ "leading wildcard", "%pro%", "full scan", "prefix lookup", "performance", "like wildcard", "pattern matching", "index unusable", "table scan", "wildcard prefix" ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_006",
    "difficulty": "medium",
    "query": "SELECT * FROM events WHERE DATE(created_at) = '2024-01-15';",
    "schema": {
      "events": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)",
        "created_at": "TIMESTAMP",
        "INDEX": "idx_created_at ON events(created_at)"
      }
    },
    "context": "Find all events that happened on a specific date.",
    "ground_truth_issues": [
      {
        "id": "medium_006_function_on_index",
        "category": "performance",
        "description": "Using the DATE() function on an indexed column prevents index usage. Use a range comparison instead.",
        "severity": 0.7,
        "fix": "SELECT * FROM events WHERE created_at >= '2024-01-15 00:00:00' AND created_at < '2024-01-16 00:00:00';",
        "keywords": [ "function on column", "date function", "index", "sargable", "non-sargable", "prevents index", "range comparison", "full scan", "table scan", "index usage", "function wrapping column" ]
      },
      {
        "id": "medium_006_star",
        "category": "performance",
        "description": "SELECT * returns all columns when only specific fields may be needed.",
        "severity": 0.2,
        "fix": "SELECT id, name, created_at FROM events WHERE created_at >= '2024-01-15' AND created_at < '2024-01-16';",
        "keywords": [ "select *", "star", "all columns", "projection", "unnecessary columns", "wildcard", "over-fetching", "column selection" ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_007",
    "difficulty": "medium",
    "query": "SELECT * FROM products ORDER BY RAND() LIMIT 10;",
    "schema": {
      "products": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)",
        "price": "DECIMAL(10,2)",
        "category": "VARCHAR(64)"
      }
    },
    "context": "Show 10 random products on the homepage.",
    "ground_truth_issues": [
      {
        "id": "medium_007_order_rand",
        "category": "performance",
        "description": "ORDER BY RAND() generates a random value for every row in the table, causing a full table scan and sort. Extremely slow on large tables.",
        "severity": 0.8,
        "fix": "SELECT * FROM products WHERE id >= (SELECT FLOOR(RAND() * (SELECT MAX(id) FROM products))) LIMIT 10;",
        "keywords": [ "order by rand", "random", "full table scan", "sort", "performance", "slow", "every row", "random ordering", "rand function", "expensive sort", "large table" ]
      }
    ],
    "max_steps": 5
  }
]