nl2sql-bench / data_factory /templates.py
ritvik360's picture
Upload folder using huggingface_hub
a39d8ef verified
"""
data_factory/templates.py
==========================
Human-authored, execution-verified SQL templates across 4 domains Γ— 3 difficulty tiers.
CRITICAL DESIGN PRINCIPLE:
SQL is NEVER generated by an LLM in this pipeline.
Every SQL here was written by hand and verified by running it against
seeded SQLite data. Zero errors guaranteed.
Structure per entry:
{
"domain": str, # ecommerce | healthcare | finance | hr
"difficulty": str, # easy | medium | hard
"sql": str, # verified ground-truth SQL
"description": str, # one-line English summary (seed for NL generation)
"base_nl": str, # canonical natural-language question
"has_order": bool, # True β†’ comparison is order-sensitive
}
"""
from __future__ import annotations
from typing import TypedDict
class Template(TypedDict):
domain: str
difficulty: str
sql: str
description: str
base_nl: str
has_order: bool
# ─────────────────────────────────────────────────────────────────────────────
# DOMAIN: ECOMMERCE
# ─────────────────────────────────────────────────────────────────────────────
ECOMMERCE_TEMPLATES: list[Template] = [
# ── EASY ────────────────────────────────────────────────────────────────
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "List gold-tier customers sorted alphabetically with id, name, email, country",
"base_nl": "List all gold-tier customers ordered by name alphabetically. Return id, name, email, country.",
"sql": (
"SELECT id, name, email, country "
"FROM customers "
"WHERE tier = 'gold' "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products priced above $100, sorted by price descending",
"base_nl": "Show all products with a price above $100, sorted from highest to lowest price. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"WHERE price > 100 "
"ORDER BY price DESC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Delivered orders with total_amount > 200, sorted by amount descending",
"base_nl": "Find all delivered orders with a total amount greater than $200, sorted by total amount descending. Return id, customer_id, total_amount, created_at.",
"sql": (
"SELECT id, customer_id, total_amount, created_at "
"FROM orders "
"WHERE status = 'delivered' "
" AND total_amount > 200 "
"ORDER BY total_amount DESC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Top 5 most expensive products",
"base_nl": "Return the top 5 most expensive products. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"ORDER BY price DESC "
"LIMIT 5"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Distinct countries where customers come from, sorted alphabetically",
"base_nl": "List all distinct countries our customers come from, sorted alphabetically. Return country.",
"sql": (
"SELECT DISTINCT country "
"FROM customers "
"ORDER BY country ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": False,
"description": "Count total number of customers",
"base_nl": "How many customers do we have in total? Return a single column total_customers.",
"sql": "SELECT COUNT(*) AS total_customers FROM customers",
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products with zero stock",
"base_nl": "List all out-of-stock products. Return id, name, stock_quantity.",
"sql": (
"SELECT id, name, stock_quantity "
"FROM products "
"WHERE stock_quantity = 0 "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Customers from India sorted by name",
"base_nl": "Show all customers from India, sorted by name. Return id, name, email.",
"sql": (
"SELECT id, name, email "
"FROM customers "
"WHERE country = 'India' "
"ORDER BY name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": True,
"description": "Products in a price range of $20 to $100 sorted by price ascending",
"base_nl": "Which products are priced between $20 and $100? Sort by price ascending. Return id, name, price.",
"sql": (
"SELECT id, name, price "
"FROM products "
"WHERE price BETWEEN 20 AND 100 "
"ORDER BY price ASC"
),
},
{
"domain": "ecommerce", "difficulty": "easy", "has_order": False,
"description": "Count orders by status",
"base_nl": "How many orders are there for each status? Return status, order_count.",
"sql": (
"SELECT status, COUNT(*) AS order_count "
"FROM orders "
"GROUP BY status"
),
},
# ── MEDIUM ───────────────────────────────────────────────────────────────
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Order count per customer including those with zero orders, sorted by count desc",
"base_nl": "How many orders has each customer placed? Include customers with zero orders. Return customer_name, order_count, sorted by order_count descending then customer_name ascending.",
"sql": (
"SELECT c.name AS customer_name, COUNT(o.id) AS order_count "
"FROM customers c "
"LEFT JOIN orders o ON c.id = o.customer_id "
"GROUP BY c.id, c.name "
"ORDER BY order_count DESC, customer_name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Average product rating per category sorted descending",
"base_nl": "What is the average product rating per category? Only include categories with at least one review. Return category_name, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.",
"sql": (
"SELECT c.name AS category_name, "
" ROUND(AVG(r.rating), 2) AS avg_rating "
"FROM categories c "
"JOIN products p ON p.category_id = c.id "
"JOIN reviews r ON r.product_id = p.id "
"GROUP BY c.id, c.name "
"ORDER BY avg_rating DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Customers who spent more than $500 on delivered orders",
"base_nl": "Which customers have spent more than $500 total on delivered orders? Return customer_name, total_spent (rounded to 2 decimal places), sorted by total_spent descending.",
"sql": (
"SELECT c.name AS customer_name, "
" ROUND(SUM(o.total_amount), 2) AS total_spent "
"FROM customers c "
"JOIN orders o ON o.customer_id = c.id "
"WHERE o.status = 'delivered' "
"GROUP BY c.id, c.name "
"HAVING SUM(o.total_amount) > 500 "
"ORDER BY total_spent DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Total quantity sold per product sorted descending",
"base_nl": "Show the total quantity sold for each product that appears in at least one order. Return product_name, total_quantity_sold, sorted by total_quantity_sold descending.",
"sql": (
"SELECT p.name AS product_name, "
" SUM(oi.quantity) AS total_quantity_sold "
"FROM products p "
"JOIN order_items oi ON oi.product_id = p.id "
"GROUP BY p.id, p.name "
"ORDER BY total_quantity_sold DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Product count and average price per category sorted by count desc",
"base_nl": "For each category, show the number of products and their average price. Return category_name, product_count, avg_price (rounded to 2 decimal places), sorted by product_count descending.",
"sql": (
"SELECT cat.name AS category_name, "
" COUNT(p.id) AS product_count, "
" ROUND(AVG(p.price), 2) AS avg_price "
"FROM categories cat "
"JOIN products p ON p.category_id = cat.id "
"GROUP BY cat.id, cat.name "
"ORDER BY product_count DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Categories with more than 5 in-stock products sorted by count desc",
"base_nl": "Which categories have more than 5 products in stock (stock_quantity > 0)? Return category_name, in_stock_count, sorted by in_stock_count descending.",
"sql": (
"SELECT c.name AS category_name, "
" COUNT(p.id) AS in_stock_count "
"FROM categories c "
"JOIN products p ON p.category_id = c.id "
"WHERE p.stock_quantity > 0 "
"GROUP BY c.id, c.name "
"HAVING COUNT(p.id) > 5 "
"ORDER BY in_stock_count DESC"
),
},
{
"domain": "ecommerce", "difficulty": "medium", "has_order": True,
"description": "Total revenue per product from order items, sorted descending",
"base_nl": "What is the total revenue generated by each product from order items? Return product_name, total_revenue (rounded to 2 decimal places), sorted by total_revenue descending.",
"sql": (
"SELECT p.name AS product_name, "
" ROUND(SUM(oi.quantity * oi.unit_price), 2) AS total_revenue "
"FROM products p "
"JOIN order_items oi ON oi.product_id = p.id "
"GROUP BY p.id, p.name "
"ORDER BY total_revenue DESC"
),
},
# ── HARD ─────────────────────────────────────────────────────────────────
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Customer spending rank using DENSE_RANK on delivered orders",
"base_nl": "Rank customers by total spending on delivered orders using DENSE_RANK (rank 1 = highest spender). Return customer_name, total_spent (rounded to 2 decimal places), spending_rank, sorted by spending_rank ascending.",
"sql": (
"SELECT customer_name, total_spent, spending_rank "
"FROM ( "
" SELECT c.name AS customer_name, "
" ROUND(SUM(o.total_amount), 2) AS total_spent, "
" DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank "
" FROM customers c "
" JOIN orders o ON o.customer_id = c.id "
" WHERE o.status = 'delivered' "
" GROUP BY c.id, c.name "
") sub "
"ORDER BY spending_rank ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Monthly delivered revenue with running total using window SUM",
"base_nl": "Show the monthly revenue from delivered orders and its running cumulative total. Return month (YYYY-MM), monthly_revenue, running_total (both rounded to 2 decimal places), sorted by month ascending.",
"sql": (
"WITH monthly AS ( "
" SELECT strftime('%Y-%m', created_at) AS month, "
" ROUND(SUM(total_amount), 2) AS monthly_revenue "
" FROM orders "
" WHERE status = 'delivered' "
" GROUP BY strftime('%Y-%m', created_at) "
") "
"SELECT month, "
" monthly_revenue, "
" ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total "
"FROM monthly "
"ORDER BY month ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Customers whose most recent order was cancelled, using ROW_NUMBER CTE",
"base_nl": "Find all customers whose most recent order has status 'cancelled'. Use ROW_NUMBER to identify the latest order per customer. Return customer_name, last_order_status, last_order_date, sorted by customer_name ascending.",
"sql": (
"WITH ranked_orders AS ( "
" SELECT customer_id, status, created_at, "
" ROW_NUMBER() OVER (PARTITION BY customer_id "
" ORDER BY created_at DESC) AS rn "
" FROM orders "
") "
"SELECT c.name AS customer_name, "
" ro.status AS last_order_status, "
" ro.created_at AS last_order_date "
"FROM customers c "
"JOIN ranked_orders ro ON ro.customer_id = c.id "
"WHERE ro.rn = 1 "
" AND ro.status = 'cancelled' "
"ORDER BY customer_name ASC"
),
},
{
"domain": "ecommerce", "difficulty": "hard", "has_order": True,
"description": "Products above their category average rating, using two CTEs",
"base_nl": "Find products whose average rating is strictly above the average rating of all products in their category. Return product_name, category_name, product_avg_rating, category_avg_rating (both rounded to 2 decimal places), sorted by product_avg_rating descending then product_name ascending.",
"sql": (
"WITH product_ratings AS ( "
" SELECT p.id AS product_id, p.name AS product_name, "
" p.category_id, c.name AS category_name, "
" ROUND(AVG(r.rating), 2) AS product_avg_rating "
" FROM products p "
" JOIN reviews r ON r.product_id = p.id "
" JOIN categories c ON c.id = p.category_id "
" GROUP BY p.id, p.name, p.category_id, c.name "
"), "
"category_ratings AS ( "
" SELECT category_id, "
" ROUND(AVG(product_avg_rating), 2) AS category_avg_rating "
" FROM product_ratings "
" GROUP BY category_id "
") "
"SELECT pr.product_name, pr.category_name, "
" pr.product_avg_rating, cr.category_avg_rating "
"FROM product_ratings pr "
"JOIN category_ratings cr ON cr.category_id = pr.category_id "
"WHERE pr.product_avg_rating > cr.category_avg_rating "
"ORDER BY pr.product_avg_rating DESC, pr.product_name ASC"
),
},
]
# ─────────────────────────────────────────────────────────────────────────────
# DOMAIN: HEALTHCARE
# ─────────────────────────────────────────────────────────────────────────────
HEALTHCARE_TEMPLATES: list[Template] = [
# ── EASY ────────────────────────────────────────────────────────────────
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Doctors sorted by consultation fee descending",
"base_nl": "List all doctors sorted by consultation fee from highest to lowest. Return id, name, specialization, consultation_fee.",
"sql": (
"SELECT id, name, specialization, consultation_fee "
"FROM doctors "
"ORDER BY consultation_fee DESC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Doctors with more than 10 years experience sorted desc",
"base_nl": "Show doctors with more than 10 years of experience, sorted by experience descending. Return id, name, specialization, experience_years.",
"sql": (
"SELECT id, name, specialization, experience_years "
"FROM doctors "
"WHERE experience_years > 10 "
"ORDER BY experience_years DESC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Patients from India sorted by name",
"base_nl": "List all patients from India sorted alphabetically by name. Return id, name, country, blood_type.",
"sql": (
"SELECT id, name, country, blood_type "
"FROM patients "
"WHERE country = 'India' "
"ORDER BY name ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Medications with unit price under $0.20 sorted ascending",
"base_nl": "Which medications cost less than $0.20 per unit? Sort by price ascending. Return id, name, category, unit_price.",
"sql": (
"SELECT id, name, category, unit_price "
"FROM medications "
"WHERE unit_price < 0.20 "
"ORDER BY unit_price ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Top 5 most expensive medications",
"base_nl": "What are the top 5 most expensive medications? Return id, name, unit_price.",
"sql": (
"SELECT id, name, unit_price "
"FROM medications "
"ORDER BY unit_price DESC "
"LIMIT 5"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": False,
"description": "Count of completed appointments",
"base_nl": "How many appointments have been completed? Return a single value total_completed.",
"sql": (
"SELECT COUNT(*) AS total_completed "
"FROM appointments "
"WHERE status = 'completed'"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": True,
"description": "Severe diagnoses sorted by ICD code",
"base_nl": "List all severe diagnoses sorted by ICD code. Return id, icd_code, description, severity.",
"sql": (
"SELECT id, icd_code, description, severity "
"FROM diagnoses "
"WHERE severity = 'severe' "
"ORDER BY icd_code ASC"
),
},
{
"domain": "healthcare", "difficulty": "easy", "has_order": False,
"description": "Count patients by gender",
"base_nl": "How many patients are there by gender? Return gender, patient_count.",
"sql": (
"SELECT gender, COUNT(*) AS patient_count "
"FROM patients "
"GROUP BY gender"
),
},
# ── MEDIUM ───────────────────────────────────────────────────────────────
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Appointment count per doctor including those with no appointments",
"base_nl": "How many appointments has each doctor had (including those with none)? Return doctor_name, appointment_count, sorted by appointment_count descending.",
"sql": (
"SELECT d.name AS doctor_name, COUNT(a.id) AS appointment_count "
"FROM doctors d "
"LEFT JOIN appointments a ON a.doctor_id = d.id "
"GROUP BY d.id, d.name "
"ORDER BY appointment_count DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Most prescribed medications by count",
"base_nl": "Which medications are prescribed most often? Return medication_name, category, times_prescribed, sorted by times_prescribed descending.",
"sql": (
"SELECT m.name AS medication_name, m.category, COUNT(p.id) AS times_prescribed "
"FROM medications m "
"JOIN prescriptions p ON p.medication_id = m.id "
"GROUP BY m.id, m.name, m.category "
"ORDER BY times_prescribed DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Patients with more than one completed visit",
"base_nl": "Which patients have had more than one completed appointment? Return patient_name, visit_count, sorted by visit_count descending.",
"sql": (
"SELECT pat.name AS patient_name, COUNT(DISTINCT a.id) AS visit_count "
"FROM patients pat "
"JOIN appointments a ON a.patient_id = pat.id "
"WHERE a.status = 'completed' "
"GROUP BY pat.id, pat.name "
"HAVING COUNT(DISTINCT a.id) > 1 "
"ORDER BY visit_count DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Estimated revenue per doctor from completed appointments",
"base_nl": "What is the estimated total revenue per doctor from completed appointments (based on consultation fee)? Return doctor_name, specialization, estimated_revenue (rounded to 2 decimal places), sorted by estimated_revenue descending.",
"sql": (
"SELECT d.name AS doctor_name, d.specialization, "
" ROUND(SUM(d.consultation_fee), 2) AS estimated_revenue "
"FROM doctors d "
"JOIN appointments a ON a.doctor_id = d.id "
"WHERE a.status = 'completed' "
"GROUP BY d.id, d.name, d.specialization "
"ORDER BY estimated_revenue DESC"
),
},
{
"domain": "healthcare", "difficulty": "medium", "has_order": True,
"description": "Diagnosis count per severity level",
"base_nl": "How many diagnoses are there per severity level? Return severity, diagnosis_count, sorted by diagnosis_count descending.",
"sql": (
"SELECT severity, COUNT(*) AS diagnosis_count "
"FROM diagnoses "
"GROUP BY severity "
"ORDER BY diagnosis_count DESC"
),
},
# ── HARD ─────────────────────────────────────────────────────────────────
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Doctors ranked by appointment count within specialization using RANK",
"base_nl": "Rank doctors by appointment count within their specialization (rank 1 = most appointments). Return doctor_name, specialization, appointment_count, rank_in_spec, sorted by specialization then rank_in_spec ascending.",
"sql": (
"SELECT doctor_name, specialization, appointment_count, "
" RANK() OVER (PARTITION BY specialization ORDER BY appointment_count DESC) AS rank_in_spec "
"FROM ( "
" SELECT d.name AS doctor_name, d.specialization, COUNT(a.id) AS appointment_count "
" FROM doctors d "
" JOIN appointments a ON a.doctor_id = d.id "
" GROUP BY d.id, d.name, d.specialization "
") sub "
"ORDER BY specialization, rank_in_spec"
),
},
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Top 10 patients by total completed visits using CTE",
"base_nl": "Find the top 10 patients by number of completed appointments. Return patient_name, total_visits, last_visit, sorted by total_visits descending.",
"sql": (
"WITH patient_visits AS ( "
" SELECT a.patient_id, COUNT(a.id) AS total_visits, "
" MAX(a.scheduled_at) AS last_visit "
" FROM appointments a "
" WHERE a.status = 'completed' "
" GROUP BY a.patient_id "
") "
"SELECT p.name AS patient_name, pv.total_visits, pv.last_visit "
"FROM patients p "
"JOIN patient_visits pv ON pv.patient_id = p.id "
"ORDER BY pv.total_visits DESC "
"LIMIT 10"
),
},
{
"domain": "healthcare", "difficulty": "hard", "has_order": True,
"description": "Medications total prescription cost per category using window SUM",
"base_nl": "For each medication, show its total prescription cost (unit_price Γ— quantity) and the running total of cost within its category. Return medication_name, category, total_cost, category_running_cost (both rounded to 2 decimal places), sorted by category then total_cost descending.",
"sql": (
"WITH med_costs AS ( "
" SELECT m.name AS medication_name, m.category, "
" ROUND(SUM(m.unit_price * pr.quantity), 2) AS total_cost "
" FROM medications m "
" JOIN prescriptions pr ON pr.medication_id = m.id "
" GROUP BY m.id, m.name, m.category "
") "
"SELECT medication_name, category, total_cost, "
" ROUND(SUM(total_cost) OVER (PARTITION BY category ORDER BY total_cost DESC), 2) "
" AS category_running_cost "
"FROM med_costs "
"ORDER BY category, total_cost DESC"
),
},
]
# ─────────────────────────────────────────────────────────────────────────────
# DOMAIN: FINANCE
# ─────────────────────────────────────────────────────────────────────────────
FINANCE_TEMPLATES: list[Template] = [
# ── EASY ────────────────────────────────────────────────────────────────
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Verified KYC customers sorted by name",
"base_nl": "List all customers with verified KYC status, sorted alphabetically. Return id, name, country, kyc_status.",
"sql": (
"SELECT id, name, country, kyc_status "
"FROM fin_customers "
"WHERE kyc_status = 'verified' "
"ORDER BY name ASC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Accounts with balance over $10,000 sorted by balance descending",
"base_nl": "Which accounts have a balance greater than $10,000? Return id, customer_id, account_type, balance, sorted by balance descending.",
"sql": (
"SELECT id, customer_id, account_type, balance "
"FROM accounts "
"WHERE balance > 10000 "
"ORDER BY balance DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Large credit transactions above $1,000 sorted by amount descending",
"base_nl": "Show all credit transactions with an amount greater than $1,000. Return id, account_id, txn_type, amount, created_at, sorted by amount descending.",
"sql": (
"SELECT id, account_id, txn_type, amount, created_at "
"FROM transactions "
"WHERE txn_type = 'credit' AND amount > 1000 "
"ORDER BY amount DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Defaulted loans sorted by principal amount descending",
"base_nl": "List all defaulted loans, sorted by principal amount descending. Return id, loan_type, principal_amount, interest_rate, status.",
"sql": (
"SELECT id, loan_type, principal_amount, interest_rate, status "
"FROM loans "
"WHERE status = 'defaulted' "
"ORDER BY principal_amount DESC"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": False,
"description": "Count of late loan payments",
"base_nl": "How many loan payments were made late? Return a single value late_payments.",
"sql": "SELECT COUNT(*) AS late_payments FROM loan_payments WHERE is_late = 1",
},
{
"domain": "finance", "difficulty": "easy", "has_order": True,
"description": "Top 5 highest principal loans",
"base_nl": "What are the top 5 loans by principal amount? Return id, customer_id, loan_type, principal_amount.",
"sql": (
"SELECT id, customer_id, loan_type, principal_amount "
"FROM loans "
"ORDER BY principal_amount DESC "
"LIMIT 5"
),
},
{
"domain": "finance", "difficulty": "easy", "has_order": False,
"description": "Count of accounts by account type",
"base_nl": "How many accounts exist for each account type? Return account_type, account_count.",
"sql": (
"SELECT account_type, COUNT(*) AS account_count "
"FROM accounts "
"GROUP BY account_type"
),
},
# ── MEDIUM ───────────────────────────────────────────────────────────────
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total active account balance per customer sorted by balance descending",
"base_nl": "What is the total active account balance per customer? Return customer_name, account_count, total_balance (rounded to 2 decimal places), sorted by total_balance descending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(a.id) AS account_count, "
" ROUND(SUM(a.balance), 2) AS total_balance "
"FROM fin_customers fc "
"JOIN accounts a ON a.customer_id = fc.id "
"WHERE a.status = 'active' "
"GROUP BY fc.id, fc.name "
"ORDER BY total_balance DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total credit transaction amount by account type",
"base_nl": "What is the total credit amount per account type? Return account_type, total_credits (rounded to 2 decimal places), sorted by total_credits descending.",
"sql": (
"SELECT a.account_type, ROUND(SUM(t.amount), 2) AS total_credits "
"FROM accounts a "
"JOIN transactions t ON t.account_id = a.id "
"WHERE t.txn_type = 'credit' "
"GROUP BY a.account_type "
"ORDER BY total_credits DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Total loan borrowing per customer sorted descending",
"base_nl": "How much has each customer borrowed in total across all loans? Return customer_name, loan_count, total_borrowed (rounded to 2 decimal places), sorted by total_borrowed descending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count, "
" ROUND(SUM(l.principal_amount), 2) AS total_borrowed "
"FROM fin_customers fc "
"JOIN loans l ON l.customer_id = fc.id "
"GROUP BY fc.id, fc.name "
"ORDER BY total_borrowed DESC"
),
},
{
"domain": "finance", "difficulty": "medium", "has_order": True,
"description": "Late payment count and total amount by loan type",
"base_nl": "For each loan type, how many late payments were there and what was the total amount paid late? Return loan_type, late_payments, total_late_paid (rounded to 2 decimal places), sorted by late_payments descending.",
"sql": (
"SELECT l.loan_type, COUNT(lp.id) AS late_payments, "
" ROUND(SUM(lp.amount_paid), 2) AS total_late_paid "
"FROM loans l "
"JOIN loan_payments lp ON lp.loan_id = l.id "
"WHERE lp.is_late = 1 "
"GROUP BY l.loan_type "
"ORDER BY late_payments DESC"
),
},
# ── HARD ─────────────────────────────────────────────────────────────────
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Customer balance rank using DENSE_RANK on active accounts",
"base_nl": "Rank customers by their total active account balance using DENSE_RANK. Return customer_name, total_balance, balance_rank, sorted by balance_rank ascending.",
"sql": (
"SELECT customer_name, total_balance, "
" DENSE_RANK() OVER (ORDER BY total_balance DESC) AS balance_rank "
"FROM ( "
" SELECT fc.name AS customer_name, "
" ROUND(SUM(a.balance), 2) AS total_balance "
" FROM fin_customers fc "
" JOIN accounts a ON a.customer_id = fc.id "
" WHERE a.status = 'active' "
" GROUP BY fc.id, fc.name "
") sub "
"ORDER BY balance_rank"
),
},
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Monthly transaction totals by type with running total using window SUM",
"base_nl": "Show monthly transaction totals per type (credit/debit) with a running cumulative total. Return month (YYYY-MM), txn_type, total, running_total (rounded to 2 decimal places), sorted by month then txn_type.",
"sql": (
"WITH monthly_txn AS ( "
" SELECT strftime('%Y-%m', created_at) AS month, "
" txn_type, "
" ROUND(SUM(amount), 2) AS total "
" FROM transactions "
" GROUP BY strftime('%Y-%m', created_at), txn_type "
") "
"SELECT month, txn_type, total, "
" ROUND(SUM(total) OVER (PARTITION BY txn_type ORDER BY month), 2) AS running_total "
"FROM monthly_txn "
"ORDER BY month, txn_type"
),
},
{
"domain": "finance", "difficulty": "hard", "has_order": True,
"description": "Customers with only defaulted loans using NOT EXISTS",
"base_nl": "Find customers who have at least one loan and ALL their loans are defaulted. Return customer_name, loan_count, sorted by customer_name ascending.",
"sql": (
"SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count "
"FROM fin_customers fc "
"JOIN loans l ON l.customer_id = fc.id "
"GROUP BY fc.id, fc.name "
"HAVING COUNT(l.id) > 0 "
" AND SUM(CASE WHEN l.status != 'defaulted' THEN 1 ELSE 0 END) = 0 "
"ORDER BY customer_name ASC"
),
},
]
# ─────────────────────────────────────────────────────────────────────────────
# DOMAIN: HR
# ─────────────────────────────────────────────────────────────────────────────
HR_TEMPLATES: list[Template] = [
# ── EASY ────────────────────────────────────────────────────────────────
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active employees sorted by salary descending",
"base_nl": "List all active employees sorted by salary from highest to lowest. Return id, name, job_title, salary.",
"sql": (
"SELECT id, name, job_title, salary "
"FROM employees "
"WHERE status = 'active' "
"ORDER BY salary DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Departments sorted by budget descending",
"base_nl": "Show all departments sorted by budget from largest to smallest. Return id, name, location, budget.",
"sql": (
"SELECT id, name, location, budget "
"FROM departments "
"ORDER BY budget DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Employees hired in 2023 or later sorted by hire date descending",
"base_nl": "Which employees were hired on or after January 1st 2023? Sort by hire date descending. Return id, name, job_title, hire_date.",
"sql": (
"SELECT id, name, job_title, hire_date "
"FROM employees "
"WHERE hire_date >= '2023-01-01' "
"ORDER BY hire_date DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active projects sorted by budget descending",
"base_nl": "Show all currently active projects sorted by budget descending. Return id, name, status, budget.",
"sql": (
"SELECT id, name, status, budget "
"FROM projects "
"WHERE status = 'active' "
"ORDER BY budget DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Active employees earning above $100,000 sorted by salary descending",
"base_nl": "Which active employees earn more than $100,000? Return id, name, email, job_title, sorted by salary descending.",
"sql": (
"SELECT id, name, email, job_title "
"FROM employees "
"WHERE status = 'active' AND salary > 100000 "
"ORDER BY salary DESC"
),
},
{
"domain": "hr", "difficulty": "easy", "has_order": False,
"description": "Count of active employees",
"base_nl": "How many active employees do we currently have? Return active_employees.",
"sql": "SELECT COUNT(*) AS active_employees FROM employees WHERE status = 'active'",
},
{
"domain": "hr", "difficulty": "easy", "has_order": True,
"description": "Projects with no end date (ongoing) sorted by budget descending",
"base_nl": "List all ongoing projects that have no end date set. Return id, name, start_date, budget, sorted by budget descending.",
"sql": (
"SELECT id, name, start_date, budget "
"FROM projects "
"WHERE end_date IS NULL "
"ORDER BY budget DESC"
),
},
# ── MEDIUM ───────────────────────────────────────────────────────────────
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Headcount and average salary per department for active employees",
"base_nl": "For each department, what is the headcount and average salary of active employees? Return department_name, headcount, avg_salary (rounded to 2 decimal places), sorted by headcount descending.",
"sql": (
"SELECT d.name AS department_name, COUNT(e.id) AS headcount, "
" ROUND(AVG(e.salary), 2) AS avg_salary "
"FROM departments d "
"LEFT JOIN employees e ON e.department_id = d.id AND e.status = 'active' "
"GROUP BY d.id, d.name "
"ORDER BY headcount DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Average performance rating per employee sorted descending",
"base_nl": "What is the average performance review rating per active employee? Return employee_name, job_title, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.",
"sql": (
"SELECT e.name AS employee_name, e.job_title, "
" ROUND(AVG(pr.rating), 2) AS avg_rating "
"FROM employees e "
"JOIN performance_reviews pr ON pr.employee_id = e.id "
"WHERE e.status = 'active' "
"GROUP BY e.id, e.name, e.job_title "
"ORDER BY avg_rating DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Employees with the most total allocated project hours",
"base_nl": "Which employees have the most total hours allocated across projects? Return employee_name, total_hours, sorted by total_hours descending, top 10.",
"sql": (
"SELECT e.name AS employee_name, SUM(pa.hours_allocated) AS total_hours "
"FROM employees e "
"JOIN project_assignments pa ON pa.employee_id = e.id "
"GROUP BY e.id, e.name "
"ORDER BY total_hours DESC "
"LIMIT 10"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Departments with distinct employees assigned to active projects",
"base_nl": "For each department, how many distinct employees are assigned to active projects? Return department_name, assigned_employees, sorted by assigned_employees descending.",
"sql": (
"SELECT d.name AS department_name, "
" COUNT(DISTINCT pa.employee_id) AS assigned_employees "
"FROM departments d "
"JOIN projects p ON p.department_id = d.id "
"JOIN project_assignments pa ON pa.project_id = p.id "
"WHERE p.status = 'active' "
"GROUP BY d.id, d.name "
"ORDER BY assigned_employees DESC"
),
},
{
"domain": "hr", "difficulty": "medium", "has_order": True,
"description": "Total project budget per department sorted descending",
"base_nl": "What is the total project budget per department? Return department_name, total_project_budget (rounded to 2 decimal places), sorted by total_project_budget descending.",
"sql": (
"SELECT d.name AS department_name, "
" ROUND(SUM(p.budget), 2) AS total_project_budget "
"FROM departments d "
"JOIN projects p ON p.department_id = d.id "
"GROUP BY d.id, d.name "
"ORDER BY total_project_budget DESC"
),
},
# ── HARD ─────────────────────────────────────────────────────────────────
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Salary rank within department using DENSE_RANK",
"base_nl": "Rank active employees by salary within their department using DENSE_RANK (rank 1 = highest paid). Return employee_name, salary, department_name, salary_rank, sorted by department_name then salary_rank ascending.",
"sql": (
"SELECT employee_name, salary, department_name, "
" DENSE_RANK() OVER (PARTITION BY department_name ORDER BY salary DESC) AS salary_rank "
"FROM ( "
" SELECT e.name AS employee_name, e.salary, d.name AS department_name "
" FROM employees e "
" JOIN departments d ON d.id = e.department_id "
" WHERE e.status = 'active' "
") sub "
"ORDER BY department_name, salary_rank"
),
},
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Employee performance band classification using CASE with avg rating CTE",
"base_nl": "Classify active employees into performance bands (High Performer: avg rating >= 4, Average: >= 3, Needs Improvement: < 3) based on their average review rating. Return employee_name, salary, avg_rating, performance_band, sorted by avg_rating descending.",
"sql": (
"WITH avg_ratings AS ( "
" SELECT employee_id, ROUND(AVG(rating), 2) AS avg_rating "
" FROM performance_reviews "
" GROUP BY employee_id "
") "
"SELECT e.name AS employee_name, e.salary, ar.avg_rating, "
" CASE WHEN ar.avg_rating >= 4 THEN 'High Performer' "
" WHEN ar.avg_rating >= 3 THEN 'Average' "
" ELSE 'Needs Improvement' "
" END AS performance_band "
"FROM employees e "
"JOIN avg_ratings ar ON ar.employee_id = e.id "
"WHERE e.status = 'active' "
"ORDER BY ar.avg_rating DESC"
),
},
{
"domain": "hr", "difficulty": "hard", "has_order": True,
"description": "Employees above their department average salary using CTE",
"base_nl": "Find active employees whose salary is above their department's average. Return employee_name, department_name, salary, dept_avg_salary (rounded to 2 decimal places), sorted by salary descending.",
"sql": (
"WITH dept_avg AS ( "
" SELECT department_id, ROUND(AVG(salary), 2) AS dept_avg_salary "
" FROM employees "
" WHERE status = 'active' "
" GROUP BY department_id "
") "
"SELECT e.name AS employee_name, d.name AS department_name, "
" e.salary, da.dept_avg_salary "
"FROM employees e "
"JOIN departments d ON d.id = e.department_id "
"JOIN dept_avg da ON da.department_id = e.department_id "
"WHERE e.status = 'active' AND e.salary > da.dept_avg_salary "
"ORDER BY e.salary DESC"
),
},
]
# ─────────────────────────────────────────────────────────────────────────────
# MASTER TEMPLATE REGISTRY
# ─────────────────────────────────────────────────────────────────────────────
ALL_TEMPLATES: list[Template] = (
ECOMMERCE_TEMPLATES +
HEALTHCARE_TEMPLATES +
FINANCE_TEMPLATES +
HR_TEMPLATES
)
TEMPLATES_BY_DOMAIN: dict[str, list[Template]] = {
"ecommerce": ECOMMERCE_TEMPLATES,
"healthcare": HEALTHCARE_TEMPLATES,
"finance": FINANCE_TEMPLATES,
"hr": HR_TEMPLATES,
}
TEMPLATES_BY_DIFFICULTY: dict[str, list[Template]] = {
"easy": [t for t in ALL_TEMPLATES if t["difficulty"] == "easy"],
"medium": [t for t in ALL_TEMPLATES if t["difficulty"] == "medium"],
"hard": [t for t in ALL_TEMPLATES if t["difficulty"] == "hard"],
}
def template_stats() -> dict:
stats: dict = {"total": len(ALL_TEMPLATES), "by_domain": {}, "by_difficulty": {}}
for d in ["ecommerce","healthcare","finance","hr"]:
stats["by_domain"][d] = len(TEMPLATES_BY_DOMAIN[d])
for diff in ["easy","medium","hard"]:
stats["by_difficulty"][diff] = len(TEMPLATES_BY_DIFFICULTY[diff])
return stats