Spaces:
Sleeping
Sleeping
improve: 20 tasks, richer keywords, enhanced reward/grader, bigram matching, compelling README
b83c8ad | [ | |
| { | |
| "task_id": "medium_001", | |
| "difficulty": "medium", | |
| "query": "SELECT * FROM events ORDER BY created_at DESC;", | |
| "schema": { | |
| "events": { | |
| "id": "BIGINT PRIMARY KEY", | |
| "event_name": "VARCHAR(255)", | |
| "payload": "JSON", | |
| "created_at": "TIMESTAMP INDEX", | |
| "actor_id": "BIGINT", | |
| "metadata": "JSON" | |
| } | |
| }, | |
| "context": "Show the most recent events on an admin dashboard.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_001_select_star", | |
| "category": "performance", | |
| "description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.", | |
| "severity": 0.3, | |
| "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;", | |
| "keywords": [ | |
| "select *", "wide table", "projection", "performance", "star", | |
| "all columns", "unnecessary columns", "column selection", | |
| "over-fetching", "wildcard" | |
| ] | |
| }, | |
| { | |
| "id": "medium_001_missing_limit", | |
| "category": "performance", | |
| "description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.", | |
| "severity": 0.3, | |
| "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;", | |
| "keywords": [ | |
| "limit", "unbounded query", "dashboard", "performance", "no limit", | |
| "missing limit", "unlimited rows", "pagination", "all rows", | |
| "full scan", "row count" | |
| ] | |
| } | |
| ], | |
| "max_steps": 5 | |
| }, | |
| { | |
| "task_id": "medium_002", | |
| "difficulty": "medium", | |
| "query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;", | |
| "schema": { | |
| "customers": { | |
| "id": "INT PRIMARY KEY", | |
| "name": "VARCHAR(255)" | |
| }, | |
| "orders": { | |
| "id": "INT PRIMARY KEY", | |
| "customer_id": "INT INDEX", | |
| "total": "DECIMAL(10,2)" | |
| } | |
| }, | |
| "context": "Show each customer with the number of orders they have placed.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_002_correlated_subquery", | |
| "category": "performance", | |
| "description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.", | |
| "severity": 0.6, | |
| "fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;", | |
| "keywords": [ | |
| "correlated subquery", "group by", "join", "count", "performance", | |
| "subquery per row", "n+1", "rewrite", "left join", "aggregate", | |
| "scalar subquery", "dependent subquery" | |
| ] | |
| } | |
| ], | |
| "max_steps": 6 | |
| }, | |
| { | |
| "task_id": "medium_003", | |
| "difficulty": "medium", | |
| "query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;", | |
| "schema": { | |
| "users": { | |
| "id": "INT PRIMARY KEY", | |
| "email": "VARCHAR(255) UNIQUE", | |
| "last_login_at": "TIMESTAMP NULL" | |
| } | |
| }, | |
| "context": "Export non-null user emails for a CRM sync.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_003_redundant_distinct", | |
| "category": "performance", | |
| "description": "DISTINCT is redundant because users.email is already unique.", | |
| "severity": 0.45, | |
| "fix": "SELECT email FROM users WHERE email IS NOT NULL;", | |
| "keywords": [ | |
| "distinct", "unique", "redundant", "email", "performance", | |
| "unnecessary distinct", "unique constraint", "already unique", | |
| "duplicate elimination", "deduplication", "wasted sort" | |
| ] | |
| } | |
| ], | |
| "max_steps": 5 | |
| }, | |
| { | |
| "task_id": "medium_004", | |
| "difficulty": "medium", | |
| "query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';", | |
| "schema": { | |
| "orders": { | |
| "id": "INT PRIMARY KEY", | |
| "user_id": "INT INDEX", | |
| "created_at": "TIMESTAMP INDEX", | |
| "total": "DECIMAL(10,2)" | |
| }, | |
| "users": { | |
| "id": "INT PRIMARY KEY", | |
| "name": "VARCHAR(255)" | |
| } | |
| }, | |
| "context": "List orders placed on a specific date with the user name attached.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_004_function_on_indexed_column", | |
| "category": "performance", | |
| "description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.", | |
| "severity": 0.6, | |
| "fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';", | |
| "keywords": [ | |
| "date()", "function on column", "index", "range predicate", "performance", | |
| "sargable", "non-sargable", "prevents index", "full scan", | |
| "index usage", "function wrapping" | |
| ] | |
| } | |
| ], | |
| "max_steps": 6 | |
| }, | |
| { | |
| "task_id": "medium_005", | |
| "difficulty": "medium", | |
| "query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';", | |
| "schema": { | |
| "products": { | |
| "id": "INT PRIMARY KEY", | |
| "name": "VARCHAR(255) INDEX", | |
| "category_id": "INT", | |
| "price": "DECIMAL(10,2)" | |
| } | |
| }, | |
| "context": "Search products whose names contain the text pro.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_005_lower_blocks_index", | |
| "category": "performance", | |
| "description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.", | |
| "severity": 0.35, | |
| "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';", | |
| "keywords": [ | |
| "lower", "function on column", "index", "performance", "sargable", | |
| "non-sargable", "case insensitive", "full scan", "table scan", | |
| "function wrapping column" | |
| ] | |
| }, | |
| { | |
| "id": "medium_005_leading_wildcard", | |
| "category": "performance", | |
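| "description": "The leading wildcard in LIKE '%pro%' prevents the index on name from being used and forces a full scan. If prefix matching satisfies the search, 'pro%' allows an index-friendly prefix lookup; a true substring search needs a trigram or full-text index instead.", | |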
| "severity": 0.35, | |
| "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';", | |
| "keywords": [ | |
| "leading wildcard", "%pro%", "full scan", "prefix lookup", "performance", | |
| "like wildcard", "pattern matching", "index unusable", "table scan", | |
| "wildcard prefix" | |
| ] | |
| } | |
| ], | |
| "max_steps": 6 | |
| }, | |
| { | |
| "task_id": "medium_006", | |
| "difficulty": "medium", | |
| "query": "SELECT * FROM events WHERE DATE(created_at) = '2024-01-15';", | |
| "schema": { | |
| "events": { | |
| "id": "INT PRIMARY KEY", | |
| "name": "VARCHAR(255)", | |
| "created_at": "TIMESTAMP", | |
| "INDEX": "idx_created_at ON events(created_at)" | |
| } | |
| }, | |
| "context": "Find all events that happened on a specific date.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_006_function_on_index", | |
| "category": "performance", | |
| "description": "Using the DATE() function on an indexed column prevents index usage. Use a range comparison instead.", | |
| "severity": 0.7, | |
| "fix": "SELECT * FROM events WHERE created_at >= '2024-01-15 00:00:00' AND created_at < '2024-01-16 00:00:00';", | |
| "keywords": [ | |
| "function on column", "date function", "index", "sargable", | |
| "non-sargable", "prevents index", "range comparison", "full scan", | |
| "table scan", "index usage", "function wrapping column" | |
| ] | |
| }, | |
| { | |
| "id": "medium_006_star", | |
| "category": "performance", | |
| "description": "SELECT * returns all columns when only specific fields may be needed.", | |
| "severity": 0.2, | |
| "fix": "SELECT id, name, created_at FROM events WHERE created_at >= '2024-01-15' AND created_at < '2024-01-16';", | |
| "keywords": [ | |
| "select *", "star", "all columns", "projection", "unnecessary columns", | |
| "wildcard", "over-fetching", "column selection" | |
| ] | |
| } | |
| ], | |
| "max_steps": 6 | |
| }, | |
| { | |
| "task_id": "medium_007", | |
| "difficulty": "medium", | |
| "query": "SELECT * FROM products ORDER BY RAND() LIMIT 10;", | |
| "schema": { | |
| "products": { | |
| "id": "INT PRIMARY KEY", | |
| "name": "VARCHAR(255)", | |
| "price": "DECIMAL(10,2)", | |
| "category": "VARCHAR(64)" | |
| } | |
| }, | |
| "context": "Show 10 random products on the homepage.", | |
| "ground_truth_issues": [ | |
| { | |
| "id": "medium_007_order_rand", | |
| "category": "performance", | |
| "description": "ORDER BY RAND() generates a random value for every row in the table, causing a full table scan and sort. This is extremely slow on large tables.", | |
| "severity": 0.8, | |
| "fix": "SELECT * FROM products WHERE id >= (SELECT FLOOR(RAND() * (SELECT MAX(id) FROM products))) LIMIT 10;", | |
| "keywords": [ | |
| "order by rand", "random", "full table scan", "sort", "performance", | |
| "slow", "every row", "random ordering", "rand function", | |
| "expensive sort", "large table" | |
| ] | |
| } | |
| ], | |
| "max_steps": 5 | |
| } | |
| ] | |