Spaces:
Sleeping
Sleeping
| """ | |
| data_factory/templates.py | |
| ========================== | |
| Human-authored, execution-verified SQL templates across 4 domains Γ 3 difficulty tiers. | |
| CRITICAL DESIGN PRINCIPLE: | |
| SQL is NEVER generated by an LLM in this pipeline. | |
| Every SQL here was written by hand and verified by running it against | |
| seeded SQLite data. Zero errors guaranteed. | |
| Structure per entry: | |
| { | |
| "domain": str, # ecommerce | healthcare | finance | hr | |
| "difficulty": str, # easy | medium | hard | |
| "sql": str, # verified ground-truth SQL | |
| "description": str, # one-line English summary (seed for NL generation) | |
| "base_nl": str, # canonical natural-language question | |
| "has_order": bool, # True β comparison is order-sensitive | |
| } | |
| """ | |
| from __future__ import annotations | |
| from typing import TypedDict | |
| class Template(TypedDict): | |
| domain: str | |
| difficulty: str | |
| sql: str | |
| description: str | |
| base_nl: str | |
| has_order: bool | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # DOMAIN: ECOMMERCE | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| ECOMMERCE_TEMPLATES: list[Template] = [ | |
| # ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "List gold-tier customers sorted alphabetically with id, name, email, country", | |
| "base_nl": "List all gold-tier customers ordered by name alphabetically. Return id, name, email, country.", | |
| "sql": ( | |
| "SELECT id, name, email, country " | |
| "FROM customers " | |
| "WHERE tier = 'gold' " | |
| "ORDER BY name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Products priced above $100, sorted by price descending", | |
| "base_nl": "Show all products with a price above $100, sorted from highest to lowest price. Return id, name, price.", | |
| "sql": ( | |
| "SELECT id, name, price " | |
| "FROM products " | |
| "WHERE price > 100 " | |
| "ORDER BY price DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Delivered orders with total_amount > 200, sorted by amount descending", | |
| "base_nl": "Find all delivered orders with a total amount greater than $200, sorted by total amount descending. Return id, customer_id, total_amount, created_at.", | |
| "sql": ( | |
| "SELECT id, customer_id, total_amount, created_at " | |
| "FROM orders " | |
| "WHERE status = 'delivered' " | |
| " AND total_amount > 200 " | |
| "ORDER BY total_amount DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Top 5 most expensive products", | |
| "base_nl": "Return the top 5 most expensive products. Return id, name, price.", | |
| "sql": ( | |
| "SELECT id, name, price " | |
| "FROM products " | |
| "ORDER BY price DESC " | |
| "LIMIT 5" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Distinct countries where customers come from, sorted alphabetically", | |
| "base_nl": "List all distinct countries our customers come from, sorted alphabetically. Return country.", | |
| "sql": ( | |
| "SELECT DISTINCT country " | |
| "FROM customers " | |
| "ORDER BY country ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": False, | |
| "description": "Count total number of customers", | |
| "base_nl": "How many customers do we have in total? Return a single column total_customers.", | |
| "sql": "SELECT COUNT(*) AS total_customers FROM customers", | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Products with zero stock", | |
| "base_nl": "List all out-of-stock products. Return id, name, stock_quantity.", | |
| "sql": ( | |
| "SELECT id, name, stock_quantity " | |
| "FROM products " | |
| "WHERE stock_quantity = 0 " | |
| "ORDER BY name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Customers from India sorted by name", | |
| "base_nl": "Show all customers from India, sorted by name. Return id, name, email.", | |
| "sql": ( | |
| "SELECT id, name, email " | |
| "FROM customers " | |
| "WHERE country = 'India' " | |
| "ORDER BY name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": True, | |
| "description": "Products in a price range of $20 to $100 sorted by price ascending", | |
| "base_nl": "Which products are priced between $20 and $100? Sort by price ascending. Return id, name, price.", | |
| "sql": ( | |
| "SELECT id, name, price " | |
| "FROM products " | |
| "WHERE price BETWEEN 20 AND 100 " | |
| "ORDER BY price ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "easy", "has_order": False, | |
| "description": "Count orders by status", | |
| "base_nl": "How many orders are there for each status? Return status, order_count.", | |
| "sql": ( | |
| "SELECT status, COUNT(*) AS order_count " | |
| "FROM orders " | |
| "GROUP BY status" | |
| ), | |
| }, | |
| # ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Order count per customer including those with zero orders, sorted by count desc", | |
| "base_nl": "How many orders has each customer placed? Include customers with zero orders. Return customer_name, order_count, sorted by order_count descending then customer_name ascending.", | |
| "sql": ( | |
| "SELECT c.name AS customer_name, COUNT(o.id) AS order_count " | |
| "FROM customers c " | |
| "LEFT JOIN orders o ON c.id = o.customer_id " | |
| "GROUP BY c.id, c.name " | |
| "ORDER BY order_count DESC, customer_name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Average product rating per category sorted descending", | |
| "base_nl": "What is the average product rating per category? Only include categories with at least one review. Return category_name, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.", | |
| "sql": ( | |
| "SELECT c.name AS category_name, " | |
| " ROUND(AVG(r.rating), 2) AS avg_rating " | |
| "FROM categories c " | |
| "JOIN products p ON p.category_id = c.id " | |
| "JOIN reviews r ON r.product_id = p.id " | |
| "GROUP BY c.id, c.name " | |
| "ORDER BY avg_rating DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Customers who spent more than $500 on delivered orders", | |
| "base_nl": "Which customers have spent more than $500 total on delivered orders? Return customer_name, total_spent (rounded to 2 decimal places), sorted by total_spent descending.", | |
| "sql": ( | |
| "SELECT c.name AS customer_name, " | |
| " ROUND(SUM(o.total_amount), 2) AS total_spent " | |
| "FROM customers c " | |
| "JOIN orders o ON o.customer_id = c.id " | |
| "WHERE o.status = 'delivered' " | |
| "GROUP BY c.id, c.name " | |
| "HAVING SUM(o.total_amount) > 500 " | |
| "ORDER BY total_spent DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Total quantity sold per product sorted descending", | |
| "base_nl": "Show the total quantity sold for each product that appears in at least one order. Return product_name, total_quantity_sold, sorted by total_quantity_sold descending.", | |
| "sql": ( | |
| "SELECT p.name AS product_name, " | |
| " SUM(oi.quantity) AS total_quantity_sold " | |
| "FROM products p " | |
| "JOIN order_items oi ON oi.product_id = p.id " | |
| "GROUP BY p.id, p.name " | |
| "ORDER BY total_quantity_sold DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Product count and average price per category sorted by count desc", | |
| "base_nl": "For each category, show the number of products and their average price. Return category_name, product_count, avg_price (rounded to 2 decimal places), sorted by product_count descending.", | |
| "sql": ( | |
| "SELECT cat.name AS category_name, " | |
| " COUNT(p.id) AS product_count, " | |
| " ROUND(AVG(p.price), 2) AS avg_price " | |
| "FROM categories cat " | |
| "JOIN products p ON p.category_id = cat.id " | |
| "GROUP BY cat.id, cat.name " | |
| "ORDER BY product_count DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Categories with more than 5 in-stock products sorted by count desc", | |
| "base_nl": "Which categories have more than 5 products in stock (stock_quantity > 0)? Return category_name, in_stock_count, sorted by in_stock_count descending.", | |
| "sql": ( | |
| "SELECT c.name AS category_name, " | |
| " COUNT(p.id) AS in_stock_count " | |
| "FROM categories c " | |
| "JOIN products p ON p.category_id = c.id " | |
| "WHERE p.stock_quantity > 0 " | |
| "GROUP BY c.id, c.name " | |
| "HAVING COUNT(p.id) > 5 " | |
| "ORDER BY in_stock_count DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "medium", "has_order": True, | |
| "description": "Total revenue per product from order items, sorted descending", | |
| "base_nl": "What is the total revenue generated by each product from order items? Return product_name, total_revenue (rounded to 2 decimal places), sorted by total_revenue descending.", | |
| "sql": ( | |
| "SELECT p.name AS product_name, " | |
| " ROUND(SUM(oi.quantity * oi.unit_price), 2) AS total_revenue " | |
| "FROM products p " | |
| "JOIN order_items oi ON oi.product_id = p.id " | |
| "GROUP BY p.id, p.name " | |
| "ORDER BY total_revenue DESC" | |
| ), | |
| }, | |
| # ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "ecommerce", "difficulty": "hard", "has_order": True, | |
| "description": "Customer spending rank using DENSE_RANK on delivered orders", | |
| "base_nl": "Rank customers by total spending on delivered orders using DENSE_RANK (rank 1 = highest spender). Return customer_name, total_spent (rounded to 2 decimal places), spending_rank, sorted by spending_rank ascending.", | |
| "sql": ( | |
| "SELECT customer_name, total_spent, spending_rank " | |
| "FROM ( " | |
| " SELECT c.name AS customer_name, " | |
| " ROUND(SUM(o.total_amount), 2) AS total_spent, " | |
| " DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank " | |
| " FROM customers c " | |
| " JOIN orders o ON o.customer_id = c.id " | |
| " WHERE o.status = 'delivered' " | |
| " GROUP BY c.id, c.name " | |
| ") sub " | |
| "ORDER BY spending_rank ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "hard", "has_order": True, | |
| "description": "Monthly delivered revenue with running total using window SUM", | |
| "base_nl": "Show the monthly revenue from delivered orders and its running cumulative total. Return month (YYYY-MM), monthly_revenue, running_total (both rounded to 2 decimal places), sorted by month ascending.", | |
| "sql": ( | |
| "WITH monthly AS ( " | |
| " SELECT strftime('%Y-%m', created_at) AS month, " | |
| " ROUND(SUM(total_amount), 2) AS monthly_revenue " | |
| " FROM orders " | |
| " WHERE status = 'delivered' " | |
| " GROUP BY strftime('%Y-%m', created_at) " | |
| ") " | |
| "SELECT month, " | |
| " monthly_revenue, " | |
| " ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total " | |
| "FROM monthly " | |
| "ORDER BY month ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "hard", "has_order": True, | |
| "description": "Customers whose most recent order was cancelled, using ROW_NUMBER CTE", | |
| "base_nl": "Find all customers whose most recent order has status 'cancelled'. Use ROW_NUMBER to identify the latest order per customer. Return customer_name, last_order_status, last_order_date, sorted by customer_name ascending.", | |
| "sql": ( | |
| "WITH ranked_orders AS ( " | |
| " SELECT customer_id, status, created_at, " | |
| " ROW_NUMBER() OVER (PARTITION BY customer_id " | |
| " ORDER BY created_at DESC) AS rn " | |
| " FROM orders " | |
| ") " | |
| "SELECT c.name AS customer_name, " | |
| " ro.status AS last_order_status, " | |
| " ro.created_at AS last_order_date " | |
| "FROM customers c " | |
| "JOIN ranked_orders ro ON ro.customer_id = c.id " | |
| "WHERE ro.rn = 1 " | |
| " AND ro.status = 'cancelled' " | |
| "ORDER BY customer_name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "ecommerce", "difficulty": "hard", "has_order": True, | |
| "description": "Products above their category average rating, using two CTEs", | |
| "base_nl": "Find products whose average rating is strictly above the average rating of all products in their category. Return product_name, category_name, product_avg_rating, category_avg_rating (both rounded to 2 decimal places), sorted by product_avg_rating descending then product_name ascending.", | |
| "sql": ( | |
| "WITH product_ratings AS ( " | |
| " SELECT p.id AS product_id, p.name AS product_name, " | |
| " p.category_id, c.name AS category_name, " | |
| " ROUND(AVG(r.rating), 2) AS product_avg_rating " | |
| " FROM products p " | |
| " JOIN reviews r ON r.product_id = p.id " | |
| " JOIN categories c ON c.id = p.category_id " | |
| " GROUP BY p.id, p.name, p.category_id, c.name " | |
| "), " | |
| "category_ratings AS ( " | |
| " SELECT category_id, " | |
| " ROUND(AVG(product_avg_rating), 2) AS category_avg_rating " | |
| " FROM product_ratings " | |
| " GROUP BY category_id " | |
| ") " | |
| "SELECT pr.product_name, pr.category_name, " | |
| " pr.product_avg_rating, cr.category_avg_rating " | |
| "FROM product_ratings pr " | |
| "JOIN category_ratings cr ON cr.category_id = pr.category_id " | |
| "WHERE pr.product_avg_rating > cr.category_avg_rating " | |
| "ORDER BY pr.product_avg_rating DESC, pr.product_name ASC" | |
| ), | |
| }, | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # DOMAIN: HEALTHCARE | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| HEALTHCARE_TEMPLATES: list[Template] = [ | |
| # ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Doctors sorted by consultation fee descending", | |
| "base_nl": "List all doctors sorted by consultation fee from highest to lowest. Return id, name, specialization, consultation_fee.", | |
| "sql": ( | |
| "SELECT id, name, specialization, consultation_fee " | |
| "FROM doctors " | |
| "ORDER BY consultation_fee DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Doctors with more than 10 years experience sorted desc", | |
| "base_nl": "Show doctors with more than 10 years of experience, sorted by experience descending. Return id, name, specialization, experience_years.", | |
| "sql": ( | |
| "SELECT id, name, specialization, experience_years " | |
| "FROM doctors " | |
| "WHERE experience_years > 10 " | |
| "ORDER BY experience_years DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Patients from India sorted by name", | |
| "base_nl": "List all patients from India sorted alphabetically by name. Return id, name, country, blood_type.", | |
| "sql": ( | |
| "SELECT id, name, country, blood_type " | |
| "FROM patients " | |
| "WHERE country = 'India' " | |
| "ORDER BY name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Medications with unit price under $0.20 sorted ascending", | |
| "base_nl": "Which medications cost less than $0.20 per unit? Sort by price ascending. Return id, name, category, unit_price.", | |
| "sql": ( | |
| "SELECT id, name, category, unit_price " | |
| "FROM medications " | |
| "WHERE unit_price < 0.20 " | |
| "ORDER BY unit_price ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Top 5 most expensive medications", | |
| "base_nl": "What are the top 5 most expensive medications? Return id, name, unit_price.", | |
| "sql": ( | |
| "SELECT id, name, unit_price " | |
| "FROM medications " | |
| "ORDER BY unit_price DESC " | |
| "LIMIT 5" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": False, | |
| "description": "Count of completed appointments", | |
| "base_nl": "How many appointments have been completed? Return a single value total_completed.", | |
| "sql": ( | |
| "SELECT COUNT(*) AS total_completed " | |
| "FROM appointments " | |
| "WHERE status = 'completed'" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": True, | |
| "description": "Severe diagnoses sorted by ICD code", | |
| "base_nl": "List all severe diagnoses sorted by ICD code. Return id, icd_code, description, severity.", | |
| "sql": ( | |
| "SELECT id, icd_code, description, severity " | |
| "FROM diagnoses " | |
| "WHERE severity = 'severe' " | |
| "ORDER BY icd_code ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "easy", "has_order": False, | |
| "description": "Count patients by gender", | |
| "base_nl": "How many patients are there by gender? Return gender, patient_count.", | |
| "sql": ( | |
| "SELECT gender, COUNT(*) AS patient_count " | |
| "FROM patients " | |
| "GROUP BY gender" | |
| ), | |
| }, | |
| # ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "healthcare", "difficulty": "medium", "has_order": True, | |
| "description": "Appointment count per doctor including those with no appointments", | |
| "base_nl": "How many appointments has each doctor had (including those with none)? Return doctor_name, appointment_count, sorted by appointment_count descending.", | |
| "sql": ( | |
| "SELECT d.name AS doctor_name, COUNT(a.id) AS appointment_count " | |
| "FROM doctors d " | |
| "LEFT JOIN appointments a ON a.doctor_id = d.id " | |
| "GROUP BY d.id, d.name " | |
| "ORDER BY appointment_count DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "medium", "has_order": True, | |
| "description": "Most prescribed medications by count", | |
| "base_nl": "Which medications are prescribed most often? Return medication_name, category, times_prescribed, sorted by times_prescribed descending.", | |
| "sql": ( | |
| "SELECT m.name AS medication_name, m.category, COUNT(p.id) AS times_prescribed " | |
| "FROM medications m " | |
| "JOIN prescriptions p ON p.medication_id = m.id " | |
| "GROUP BY m.id, m.name, m.category " | |
| "ORDER BY times_prescribed DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "medium", "has_order": True, | |
| "description": "Patients with more than one completed visit", | |
| "base_nl": "Which patients have had more than one completed appointment? Return patient_name, visit_count, sorted by visit_count descending.", | |
| "sql": ( | |
| "SELECT pat.name AS patient_name, COUNT(DISTINCT a.id) AS visit_count " | |
| "FROM patients pat " | |
| "JOIN appointments a ON a.patient_id = pat.id " | |
| "WHERE a.status = 'completed' " | |
| "GROUP BY pat.id, pat.name " | |
| "HAVING COUNT(DISTINCT a.id) > 1 " | |
| "ORDER BY visit_count DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "medium", "has_order": True, | |
| "description": "Estimated revenue per doctor from completed appointments", | |
| "base_nl": "What is the estimated total revenue per doctor from completed appointments (based on consultation fee)? Return doctor_name, specialization, estimated_revenue (rounded to 2 decimal places), sorted by estimated_revenue descending.", | |
| "sql": ( | |
| "SELECT d.name AS doctor_name, d.specialization, " | |
| " ROUND(SUM(d.consultation_fee), 2) AS estimated_revenue " | |
| "FROM doctors d " | |
| "JOIN appointments a ON a.doctor_id = d.id " | |
| "WHERE a.status = 'completed' " | |
| "GROUP BY d.id, d.name, d.specialization " | |
| "ORDER BY estimated_revenue DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "medium", "has_order": True, | |
| "description": "Diagnosis count per severity level", | |
| "base_nl": "How many diagnoses are there per severity level? Return severity, diagnosis_count, sorted by diagnosis_count descending.", | |
| "sql": ( | |
| "SELECT severity, COUNT(*) AS diagnosis_count " | |
| "FROM diagnoses " | |
| "GROUP BY severity " | |
| "ORDER BY diagnosis_count DESC" | |
| ), | |
| }, | |
| # ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "healthcare", "difficulty": "hard", "has_order": True, | |
| "description": "Doctors ranked by appointment count within specialization using RANK", | |
| "base_nl": "Rank doctors by appointment count within their specialization (rank 1 = most appointments). Return doctor_name, specialization, appointment_count, rank_in_spec, sorted by specialization then rank_in_spec ascending.", | |
| "sql": ( | |
| "SELECT doctor_name, specialization, appointment_count, " | |
| " RANK() OVER (PARTITION BY specialization ORDER BY appointment_count DESC) AS rank_in_spec " | |
| "FROM ( " | |
| " SELECT d.name AS doctor_name, d.specialization, COUNT(a.id) AS appointment_count " | |
| " FROM doctors d " | |
| " JOIN appointments a ON a.doctor_id = d.id " | |
| " GROUP BY d.id, d.name, d.specialization " | |
| ") sub " | |
| "ORDER BY specialization, rank_in_spec" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "hard", "has_order": True, | |
| "description": "Top 10 patients by total completed visits using CTE", | |
| "base_nl": "Find the top 10 patients by number of completed appointments. Return patient_name, total_visits, last_visit, sorted by total_visits descending.", | |
| "sql": ( | |
| "WITH patient_visits AS ( " | |
| " SELECT a.patient_id, COUNT(a.id) AS total_visits, " | |
| " MAX(a.scheduled_at) AS last_visit " | |
| " FROM appointments a " | |
| " WHERE a.status = 'completed' " | |
| " GROUP BY a.patient_id " | |
| ") " | |
| "SELECT p.name AS patient_name, pv.total_visits, pv.last_visit " | |
| "FROM patients p " | |
| "JOIN patient_visits pv ON pv.patient_id = p.id " | |
| "ORDER BY pv.total_visits DESC " | |
| "LIMIT 10" | |
| ), | |
| }, | |
| { | |
| "domain": "healthcare", "difficulty": "hard", "has_order": True, | |
| "description": "Medications total prescription cost per category using window SUM", | |
| "base_nl": "For each medication, show its total prescription cost (unit_price Γ quantity) and the running total of cost within its category. Return medication_name, category, total_cost, category_running_cost (both rounded to 2 decimal places), sorted by category then total_cost descending.", | |
| "sql": ( | |
| "WITH med_costs AS ( " | |
| " SELECT m.name AS medication_name, m.category, " | |
| " ROUND(SUM(m.unit_price * pr.quantity), 2) AS total_cost " | |
| " FROM medications m " | |
| " JOIN prescriptions pr ON pr.medication_id = m.id " | |
| " GROUP BY m.id, m.name, m.category " | |
| ") " | |
| "SELECT medication_name, category, total_cost, " | |
| " ROUND(SUM(total_cost) OVER (PARTITION BY category ORDER BY total_cost DESC), 2) " | |
| " AS category_running_cost " | |
| "FROM med_costs " | |
| "ORDER BY category, total_cost DESC" | |
| ), | |
| }, | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # DOMAIN: FINANCE | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| FINANCE_TEMPLATES: list[Template] = [ | |
| # ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": True, | |
| "description": "Verified KYC customers sorted by name", | |
| "base_nl": "List all customers with verified KYC status, sorted alphabetically. Return id, name, country, kyc_status.", | |
| "sql": ( | |
| "SELECT id, name, country, kyc_status " | |
| "FROM fin_customers " | |
| "WHERE kyc_status = 'verified' " | |
| "ORDER BY name ASC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": True, | |
| "description": "Accounts with balance over $10,000 sorted by balance descending", | |
| "base_nl": "Which accounts have a balance greater than $10,000? Return id, customer_id, account_type, balance, sorted by balance descending.", | |
| "sql": ( | |
| "SELECT id, customer_id, account_type, balance " | |
| "FROM accounts " | |
| "WHERE balance > 10000 " | |
| "ORDER BY balance DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": True, | |
| "description": "Large credit transactions above $1,000 sorted by amount descending", | |
| "base_nl": "Show all credit transactions with an amount greater than $1,000. Return id, account_id, txn_type, amount, created_at, sorted by amount descending.", | |
| "sql": ( | |
| "SELECT id, account_id, txn_type, amount, created_at " | |
| "FROM transactions " | |
| "WHERE txn_type = 'credit' AND amount > 1000 " | |
| "ORDER BY amount DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": True, | |
| "description": "Defaulted loans sorted by principal amount descending", | |
| "base_nl": "List all defaulted loans, sorted by principal amount descending. Return id, loan_type, principal_amount, interest_rate, status.", | |
| "sql": ( | |
| "SELECT id, loan_type, principal_amount, interest_rate, status " | |
| "FROM loans " | |
| "WHERE status = 'defaulted' " | |
| "ORDER BY principal_amount DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": False, | |
| "description": "Count of late loan payments", | |
| "base_nl": "How many loan payments were made late? Return a single value late_payments.", | |
| "sql": "SELECT COUNT(*) AS late_payments FROM loan_payments WHERE is_late = 1", | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": True, | |
| "description": "Top 5 highest principal loans", | |
| "base_nl": "What are the top 5 loans by principal amount? Return id, customer_id, loan_type, principal_amount.", | |
| "sql": ( | |
| "SELECT id, customer_id, loan_type, principal_amount " | |
| "FROM loans " | |
| "ORDER BY principal_amount DESC " | |
| "LIMIT 5" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "easy", "has_order": False, | |
| "description": "Count of accounts by account type", | |
| "base_nl": "How many accounts exist for each account type? Return account_type, account_count.", | |
| "sql": ( | |
| "SELECT account_type, COUNT(*) AS account_count " | |
| "FROM accounts " | |
| "GROUP BY account_type" | |
| ), | |
| }, | |
| # ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "finance", "difficulty": "medium", "has_order": True, | |
| "description": "Total active account balance per customer sorted by balance descending", | |
| "base_nl": "What is the total active account balance per customer? Return customer_name, account_count, total_balance (rounded to 2 decimal places), sorted by total_balance descending.", | |
| "sql": ( | |
| "SELECT fc.name AS customer_name, COUNT(a.id) AS account_count, " | |
| " ROUND(SUM(a.balance), 2) AS total_balance " | |
| "FROM fin_customers fc " | |
| "JOIN accounts a ON a.customer_id = fc.id " | |
| "WHERE a.status = 'active' " | |
| "GROUP BY fc.id, fc.name " | |
| "ORDER BY total_balance DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "medium", "has_order": True, | |
| "description": "Total credit transaction amount by account type", | |
| "base_nl": "What is the total credit amount per account type? Return account_type, total_credits (rounded to 2 decimal places), sorted by total_credits descending.", | |
| "sql": ( | |
| "SELECT a.account_type, ROUND(SUM(t.amount), 2) AS total_credits " | |
| "FROM accounts a " | |
| "JOIN transactions t ON t.account_id = a.id " | |
| "WHERE t.txn_type = 'credit' " | |
| "GROUP BY a.account_type " | |
| "ORDER BY total_credits DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "medium", "has_order": True, | |
| "description": "Total loan borrowing per customer sorted descending", | |
| "base_nl": "How much has each customer borrowed in total across all loans? Return customer_name, loan_count, total_borrowed (rounded to 2 decimal places), sorted by total_borrowed descending.", | |
| "sql": ( | |
| "SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count, " | |
| " ROUND(SUM(l.principal_amount), 2) AS total_borrowed " | |
| "FROM fin_customers fc " | |
| "JOIN loans l ON l.customer_id = fc.id " | |
| "GROUP BY fc.id, fc.name " | |
| "ORDER BY total_borrowed DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "medium", "has_order": True, | |
| "description": "Late payment count and total amount by loan type", | |
| "base_nl": "For each loan type, how many late payments were there and what was the total amount paid late? Return loan_type, late_payments, total_late_paid (rounded to 2 decimal places), sorted by late_payments descending.", | |
| "sql": ( | |
| "SELECT l.loan_type, COUNT(lp.id) AS late_payments, " | |
| " ROUND(SUM(lp.amount_paid), 2) AS total_late_paid " | |
| "FROM loans l " | |
| "JOIN loan_payments lp ON lp.loan_id = l.id " | |
| "WHERE lp.is_late = 1 " | |
| "GROUP BY l.loan_type " | |
| "ORDER BY late_payments DESC" | |
| ), | |
| }, | |
| # ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "finance", "difficulty": "hard", "has_order": True, | |
| "description": "Customer balance rank using DENSE_RANK on active accounts", | |
| "base_nl": "Rank customers by their total active account balance using DENSE_RANK. Return customer_name, total_balance, balance_rank, sorted by balance_rank ascending.", | |
| "sql": ( | |
| "SELECT customer_name, total_balance, " | |
| " DENSE_RANK() OVER (ORDER BY total_balance DESC) AS balance_rank " | |
| "FROM ( " | |
| " SELECT fc.name AS customer_name, " | |
| " ROUND(SUM(a.balance), 2) AS total_balance " | |
| " FROM fin_customers fc " | |
| " JOIN accounts a ON a.customer_id = fc.id " | |
| " WHERE a.status = 'active' " | |
| " GROUP BY fc.id, fc.name " | |
| ") sub " | |
| "ORDER BY balance_rank" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "hard", "has_order": True, | |
| "description": "Monthly transaction totals by type with running total using window SUM", | |
| "base_nl": "Show monthly transaction totals per type (credit/debit) with a running cumulative total. Return month (YYYY-MM), txn_type, total, running_total (rounded to 2 decimal places), sorted by month then txn_type.", | |
| "sql": ( | |
| "WITH monthly_txn AS ( " | |
| " SELECT strftime('%Y-%m', created_at) AS month, " | |
| " txn_type, " | |
| " ROUND(SUM(amount), 2) AS total " | |
| " FROM transactions " | |
| " GROUP BY strftime('%Y-%m', created_at), txn_type " | |
| ") " | |
| "SELECT month, txn_type, total, " | |
| " ROUND(SUM(total) OVER (PARTITION BY txn_type ORDER BY month), 2) AS running_total " | |
| "FROM monthly_txn " | |
| "ORDER BY month, txn_type" | |
| ), | |
| }, | |
| { | |
| "domain": "finance", "difficulty": "hard", "has_order": True, | |
| "description": "Customers with only defaulted loans using NOT EXISTS", | |
| "base_nl": "Find customers who have at least one loan and ALL their loans are defaulted. Return customer_name, loan_count, sorted by customer_name ascending.", | |
| "sql": ( | |
| "SELECT fc.name AS customer_name, COUNT(l.id) AS loan_count " | |
| "FROM fin_customers fc " | |
| "JOIN loans l ON l.customer_id = fc.id " | |
| "GROUP BY fc.id, fc.name " | |
| "HAVING COUNT(l.id) > 0 " | |
| " AND SUM(CASE WHEN l.status != 'defaulted' THEN 1 ELSE 0 END) = 0 " | |
| "ORDER BY customer_name ASC" | |
| ), | |
| }, | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # DOMAIN: HR | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| HR_TEMPLATES: list[Template] = [ | |
| # ββ EASY ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Active employees sorted by salary descending", | |
| "base_nl": "List all active employees sorted by salary from highest to lowest. Return id, name, job_title, salary.", | |
| "sql": ( | |
| "SELECT id, name, job_title, salary " | |
| "FROM employees " | |
| "WHERE status = 'active' " | |
| "ORDER BY salary DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Departments sorted by budget descending", | |
| "base_nl": "Show all departments sorted by budget from largest to smallest. Return id, name, location, budget.", | |
| "sql": ( | |
| "SELECT id, name, location, budget " | |
| "FROM departments " | |
| "ORDER BY budget DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Employees hired in 2023 or later sorted by hire date descending", | |
| "base_nl": "Which employees were hired on or after January 1st 2023? Sort by hire date descending. Return id, name, job_title, hire_date.", | |
| "sql": ( | |
| "SELECT id, name, job_title, hire_date " | |
| "FROM employees " | |
| "WHERE hire_date >= '2023-01-01' " | |
| "ORDER BY hire_date DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Active projects sorted by budget descending", | |
| "base_nl": "Show all currently active projects sorted by budget descending. Return id, name, status, budget.", | |
| "sql": ( | |
| "SELECT id, name, status, budget " | |
| "FROM projects " | |
| "WHERE status = 'active' " | |
| "ORDER BY budget DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Active employees earning above $100,000 sorted by salary descending", | |
| "base_nl": "Which active employees earn more than $100,000? Return id, name, email, job_title, sorted by salary descending.", | |
| "sql": ( | |
| "SELECT id, name, email, job_title " | |
| "FROM employees " | |
| "WHERE status = 'active' AND salary > 100000 " | |
| "ORDER BY salary DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": False, | |
| "description": "Count of active employees", | |
| "base_nl": "How many active employees do we currently have? Return active_employees.", | |
| "sql": "SELECT COUNT(*) AS active_employees FROM employees WHERE status = 'active'", | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "easy", "has_order": True, | |
| "description": "Projects with no end date (ongoing) sorted by budget descending", | |
| "base_nl": "List all ongoing projects that have no end date set. Return id, name, start_date, budget, sorted by budget descending.", | |
| "sql": ( | |
| "SELECT id, name, start_date, budget " | |
| "FROM projects " | |
| "WHERE end_date IS NULL " | |
| "ORDER BY budget DESC" | |
| ), | |
| }, | |
| # ββ MEDIUM βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "hr", "difficulty": "medium", "has_order": True, | |
| "description": "Headcount and average salary per department for active employees", | |
| "base_nl": "For each department, what is the headcount and average salary of active employees? Return department_name, headcount, avg_salary (rounded to 2 decimal places), sorted by headcount descending.", | |
| "sql": ( | |
| "SELECT d.name AS department_name, COUNT(e.id) AS headcount, " | |
| " ROUND(AVG(e.salary), 2) AS avg_salary " | |
| "FROM departments d " | |
| "LEFT JOIN employees e ON e.department_id = d.id AND e.status = 'active' " | |
| "GROUP BY d.id, d.name " | |
| "ORDER BY headcount DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "medium", "has_order": True, | |
| "description": "Average performance rating per employee sorted descending", | |
| "base_nl": "What is the average performance review rating per active employee? Return employee_name, job_title, avg_rating (rounded to 2 decimal places), sorted by avg_rating descending.", | |
| "sql": ( | |
| "SELECT e.name AS employee_name, e.job_title, " | |
| " ROUND(AVG(pr.rating), 2) AS avg_rating " | |
| "FROM employees e " | |
| "JOIN performance_reviews pr ON pr.employee_id = e.id " | |
| "WHERE e.status = 'active' " | |
| "GROUP BY e.id, e.name, e.job_title " | |
| "ORDER BY avg_rating DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "medium", "has_order": True, | |
| "description": "Employees with the most total allocated project hours", | |
| "base_nl": "Which employees have the most total hours allocated across projects? Return employee_name, total_hours, sorted by total_hours descending, top 10.", | |
| "sql": ( | |
| "SELECT e.name AS employee_name, SUM(pa.hours_allocated) AS total_hours " | |
| "FROM employees e " | |
| "JOIN project_assignments pa ON pa.employee_id = e.id " | |
| "GROUP BY e.id, e.name " | |
| "ORDER BY total_hours DESC " | |
| "LIMIT 10" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "medium", "has_order": True, | |
| "description": "Departments with distinct employees assigned to active projects", | |
| "base_nl": "For each department, how many distinct employees are assigned to active projects? Return department_name, assigned_employees, sorted by assigned_employees descending.", | |
| "sql": ( | |
| "SELECT d.name AS department_name, " | |
| " COUNT(DISTINCT pa.employee_id) AS assigned_employees " | |
| "FROM departments d " | |
| "JOIN projects p ON p.department_id = d.id " | |
| "JOIN project_assignments pa ON pa.project_id = p.id " | |
| "WHERE p.status = 'active' " | |
| "GROUP BY d.id, d.name " | |
| "ORDER BY assigned_employees DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "medium", "has_order": True, | |
| "description": "Total project budget per department sorted descending", | |
| "base_nl": "What is the total project budget per department? Return department_name, total_project_budget (rounded to 2 decimal places), sorted by total_project_budget descending.", | |
| "sql": ( | |
| "SELECT d.name AS department_name, " | |
| " ROUND(SUM(p.budget), 2) AS total_project_budget " | |
| "FROM departments d " | |
| "JOIN projects p ON p.department_id = d.id " | |
| "GROUP BY d.id, d.name " | |
| "ORDER BY total_project_budget DESC" | |
| ), | |
| }, | |
| # ββ HARD βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| { | |
| "domain": "hr", "difficulty": "hard", "has_order": True, | |
| "description": "Salary rank within department using DENSE_RANK", | |
| "base_nl": "Rank active employees by salary within their department using DENSE_RANK (rank 1 = highest paid). Return employee_name, salary, department_name, salary_rank, sorted by department_name then salary_rank ascending.", | |
| "sql": ( | |
| "SELECT employee_name, salary, department_name, " | |
| " DENSE_RANK() OVER (PARTITION BY department_name ORDER BY salary DESC) AS salary_rank " | |
| "FROM ( " | |
| " SELECT e.name AS employee_name, e.salary, d.name AS department_name " | |
| " FROM employees e " | |
| " JOIN departments d ON d.id = e.department_id " | |
| " WHERE e.status = 'active' " | |
| ") sub " | |
| "ORDER BY department_name, salary_rank" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "hard", "has_order": True, | |
| "description": "Employee performance band classification using CASE with avg rating CTE", | |
| "base_nl": "Classify active employees into performance bands (High Performer: avg rating >= 4, Average: >= 3, Needs Improvement: < 3) based on their average review rating. Return employee_name, salary, avg_rating, performance_band, sorted by avg_rating descending.", | |
| "sql": ( | |
| "WITH avg_ratings AS ( " | |
| " SELECT employee_id, ROUND(AVG(rating), 2) AS avg_rating " | |
| " FROM performance_reviews " | |
| " GROUP BY employee_id " | |
| ") " | |
| "SELECT e.name AS employee_name, e.salary, ar.avg_rating, " | |
| " CASE WHEN ar.avg_rating >= 4 THEN 'High Performer' " | |
| " WHEN ar.avg_rating >= 3 THEN 'Average' " | |
| " ELSE 'Needs Improvement' " | |
| " END AS performance_band " | |
| "FROM employees e " | |
| "JOIN avg_ratings ar ON ar.employee_id = e.id " | |
| "WHERE e.status = 'active' " | |
| "ORDER BY ar.avg_rating DESC" | |
| ), | |
| }, | |
| { | |
| "domain": "hr", "difficulty": "hard", "has_order": True, | |
| "description": "Employees above their department average salary using CTE", | |
| "base_nl": "Find active employees whose salary is above their department's average. Return employee_name, department_name, salary, dept_avg_salary (rounded to 2 decimal places), sorted by salary descending.", | |
| "sql": ( | |
| "WITH dept_avg AS ( " | |
| " SELECT department_id, ROUND(AVG(salary), 2) AS dept_avg_salary " | |
| " FROM employees " | |
| " WHERE status = 'active' " | |
| " GROUP BY department_id " | |
| ") " | |
| "SELECT e.name AS employee_name, d.name AS department_name, " | |
| " e.salary, da.dept_avg_salary " | |
| "FROM employees e " | |
| "JOIN departments d ON d.id = e.department_id " | |
| "JOIN dept_avg da ON da.department_id = e.department_id " | |
| "WHERE e.status = 'active' AND e.salary > da.dept_avg_salary " | |
| "ORDER BY e.salary DESC" | |
| ), | |
| }, | |
| ] | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # MASTER TEMPLATE REGISTRY | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| ALL_TEMPLATES: list[Template] = ( | |
| ECOMMERCE_TEMPLATES + | |
| HEALTHCARE_TEMPLATES + | |
| FINANCE_TEMPLATES + | |
| HR_TEMPLATES | |
| ) | |
| TEMPLATES_BY_DOMAIN: dict[str, list[Template]] = { | |
| "ecommerce": ECOMMERCE_TEMPLATES, | |
| "healthcare": HEALTHCARE_TEMPLATES, | |
| "finance": FINANCE_TEMPLATES, | |
| "hr": HR_TEMPLATES, | |
| } | |
| TEMPLATES_BY_DIFFICULTY: dict[str, list[Template]] = { | |
| "easy": [t for t in ALL_TEMPLATES if t["difficulty"] == "easy"], | |
| "medium": [t for t in ALL_TEMPLATES if t["difficulty"] == "medium"], | |
| "hard": [t for t in ALL_TEMPLATES if t["difficulty"] == "hard"], | |
| } | |
| def template_stats() -> dict: | |
| stats: dict = {"total": len(ALL_TEMPLATES), "by_domain": {}, "by_difficulty": {}} | |
| for d in ["ecommerce","healthcare","finance","hr"]: | |
| stats["by_domain"][d] = len(TEMPLATES_BY_DOMAIN[d]) | |
| for diff in ["easy","medium","hard"]: | |
| stats["by_difficulty"][diff] = len(TEMPLATES_BY_DIFFICULTY[diff]) | |
| return stats | |