sql-query-reviewer / tasks /medium_tasks.json
hellinferno's picture
improve: 20 tasks, richer keywords, enhanced reward/grader, bigram matching, compelling README
b83c8ad
[
{
"task_id": "medium_001",
"difficulty": "medium",
"query": "SELECT * FROM events ORDER BY created_at DESC;",
"schema": {
"events": {
"id": "BIGINT PRIMARY KEY",
"event_name": "VARCHAR(255)",
"payload": "JSON",
"created_at": "TIMESTAMP INDEX",
"actor_id": "BIGINT",
"metadata": "JSON"
}
},
"context": "Show the most recent events on an admin dashboard.",
"ground_truth_issues": [
{
"id": "medium_001_select_star",
"category": "performance",
"description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.",
"severity": 0.3,
"fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
"keywords": [
"select *", "wide table", "projection", "performance", "star",
"all columns", "unnecessary columns", "column selection",
"over-fetching", "wildcard"
]
},
{
"id": "medium_001_missing_limit",
"category": "performance",
"description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.",
"severity": 0.3,
"fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
"keywords": [
"limit", "unbounded query", "dashboard", "performance", "no limit",
"missing limit", "unlimited rows", "pagination", "all rows",
"full scan", "row count"
]
}
],
"max_steps": 5
},
{
"task_id": "medium_002",
"difficulty": "medium",
"query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;",
"schema": {
"customers": {
"id": "INT PRIMARY KEY",
"name": "VARCHAR(255)"
},
"orders": {
"id": "INT PRIMARY KEY",
"customer_id": "INT INDEX",
"total": "DECIMAL(10,2)"
}
},
"context": "Show each customer with the number of orders they have placed.",
"ground_truth_issues": [
{
"id": "medium_002_correlated_subquery",
"category": "performance",
"description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.",
"severity": 0.6,
"fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;",
"keywords": [
"correlated subquery", "group by", "join", "count", "performance",
"subquery per row", "n+1", "rewrite", "left join", "aggregate",
"scalar subquery", "dependent subquery"
]
}
],
"max_steps": 6
},
{
"task_id": "medium_003",
"difficulty": "medium",
"query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;",
"schema": {
"users": {
"id": "INT PRIMARY KEY",
"email": "VARCHAR(255) UNIQUE",
"last_login_at": "TIMESTAMP NULL"
}
},
"context": "Export non-null user emails for a CRM sync.",
"ground_truth_issues": [
{
"id": "medium_003_redundant_distinct",
"category": "performance",
"description": "DISTINCT is redundant because users.email is already unique.",
"severity": 0.45,
"fix": "SELECT email FROM users WHERE email IS NOT NULL;",
"keywords": [
"distinct", "unique", "redundant", "email", "performance",
"unnecessary distinct", "unique constraint", "already unique",
"duplicate elimination", "deduplication", "wasted sort"
]
}
],
"max_steps": 5
},
{
"task_id": "medium_004",
"difficulty": "medium",
"query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';",
"schema": {
"orders": {
"id": "INT PRIMARY KEY",
"user_id": "INT INDEX",
"created_at": "TIMESTAMP INDEX",
"total": "DECIMAL(10,2)"
},
"users": {
"id": "INT PRIMARY KEY",
"name": "VARCHAR(255)"
}
},
"context": "List orders placed on a specific date with the user name attached.",
"ground_truth_issues": [
{
"id": "medium_004_function_on_indexed_column",
"category": "performance",
"description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.",
"severity": 0.6,
"fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';",
"keywords": [
"date()", "function on column", "index", "range predicate", "performance",
"sargable", "non-sargable", "prevents index", "full scan",
"index usage", "function wrapping"
]
}
],
"max_steps": 6
},
{
"task_id": "medium_005",
"difficulty": "medium",
"query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';",
"schema": {
"products": {
"id": "INT PRIMARY KEY",
"name": "VARCHAR(255) INDEX",
"category_id": "INT",
"price": "DECIMAL(10,2)"
}
},
"context": "Search products whose names contain the text pro.",
"ground_truth_issues": [
{
"id": "medium_005_lower_blocks_index",
"category": "performance",
"description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.",
"severity": 0.35,
"fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
"keywords": [
"lower", "function on column", "index", "performance", "sargable",
"non-sargable", "case insensitive", "full scan", "table scan",
"function wrapping column"
]
},
{
"id": "medium_005_leading_wildcard",
"category": "performance",
"description": "The leading wildcard in LIKE '%pro%' forces a full scan instead of an index-friendly prefix lookup.",
"severity": 0.35,
"fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
"keywords": [
"leading wildcard", "%pro%", "full scan", "prefix lookup", "performance",
"like wildcard", "pattern matching", "index unusable", "table scan",
"wildcard prefix"
]
}
],
"max_steps": 6
},
{
"task_id": "medium_006",
"difficulty": "medium",
"query": "SELECT * FROM events WHERE DATE(created_at) = '2024-01-15';",
"schema": {
"events": {
"id": "INT PRIMARY KEY",
"name": "VARCHAR(255)",
"created_at": "TIMESTAMP",
"INDEX": "idx_created_at ON events(created_at)"
}
},
"context": "Find all events that happened on a specific date.",
"ground_truth_issues": [
{
"id": "medium_006_function_on_index",
"category": "performance",
"description": "Using DATE() function on an indexed column prevents index usage. Use a range comparison instead.",
"severity": 0.7,
"fix": "SELECT * FROM events WHERE created_at >= '2024-01-15 00:00:00' AND created_at < '2024-01-16 00:00:00';",
"keywords": [
"function on column", "date function", "index", "sargable",
"non-sargable", "prevents index", "range comparison", "full scan",
"table scan", "index usage", "function wrapping column"
]
},
{
"id": "medium_006_star",
"category": "performance",
"description": "SELECT * returns all columns when only specific fields may be needed.",
"severity": 0.2,
"fix": "SELECT id, name, created_at FROM events WHERE created_at >= '2024-01-15' AND created_at < '2024-01-16';",
"keywords": [
"select *", "star", "all columns", "projection", "unnecessary columns",
"wildcard", "over-fetching", "column selection"
]
}
],
"max_steps": 6
},
{
"task_id": "medium_007",
"difficulty": "medium",
"query": "SELECT * FROM products ORDER BY RAND() LIMIT 10;",
"schema": {
"products": {
"id": "INT PRIMARY KEY",
"name": "VARCHAR(255)",
"price": "DECIMAL(10,2)",
"category": "VARCHAR(64)"
}
},
"context": "Show 10 random products on the homepage.",
"ground_truth_issues": [
{
"id": "medium_007_order_rand",
"category": "performance",
"description": "ORDER BY RAND() generates a random value for every row in the table, causing a full table scan and sort. Extremely slow on large tables.",
"severity": 0.8,
"fix": "SELECT * FROM products WHERE id >= (SELECT FLOOR(RAND() * (SELECT MAX(id) FROM products))) LIMIT 10;",
"keywords": [
"order by rand", "random", "full table scan", "sort", "performance",
"slow", "every row", "random ordering", "rand function",
"expensive sort", "large table"
]
}
],
"max_steps": 5
}
]