""" nl2sql-bench/server/tasks/hard.py =================================== Task 3 — Analytics & Window (difficulty: hard) Questions require CTEs, window functions (RANK, ROW_NUMBER, running totals), or non-trivial subqueries. Even strong frontier models often need 3–5 steps. """ from __future__ import annotations from .base import BaseTask, TaskExample, register @register class AnalyticsWindowTask(BaseTask): name = "analytics-window" difficulty = "hard" examples = [ TaskExample( question=( "Rank customers by their total spending on delivered orders " "using DENSE_RANK (rank 1 = highest spender). " "Return columns: customer_name, total_spent, spending_rank. " "Round total_spent to 2 decimal places. " "Sort by spending_rank ascending." ), sql=( "SELECT customer_name, total_spent, spending_rank " "FROM ( " " SELECT c.name AS customer_name, " " ROUND(SUM(o.total_amount), 2) AS total_spent, " " DENSE_RANK() OVER (ORDER BY SUM(o.total_amount) DESC) AS spending_rank " " FROM customers c " " JOIN orders o ON o.customer_id = c.id " " WHERE o.status = 'delivered' " " GROUP BY c.id, c.name " ") sub " "ORDER BY spending_rank ASC" ), notes="Window function DENSE_RANK inside a subquery wrapping a GROUP BY.", ), TaskExample( question=( "For each product that has been reviewed, show its name, its own " "average rating, and the average rating of all products in its category. " "Return columns: product_name, product_avg_rating, category_avg_rating. " "Round both averages to 2 decimal places. " "Sort by product_avg_rating descending." ), sql=( "SELECT p.name AS product_name, " " ROUND(AVG(r.rating), 2) AS product_avg_rating, " " ROUND(AVG(AVG(r.rating)) OVER (PARTITION BY p.category_id), 2) " " AS category_avg_rating " "FROM products p " "JOIN reviews r ON r.product_id = p.id " "GROUP BY p.id, p.name, p.category_id " "ORDER BY product_avg_rating DESC" ), notes="AVG of AVG via window PARTITION BY — requires nested aggregate understanding.", ), TaskExample( question=( "Find all customers whose most recent order has status 'cancelled'. " "Use a CTE with ROW_NUMBER to identify the latest order per customer. " "Return columns: customer_name, last_order_status, last_order_date. " "Sort by customer_name ascending." ), sql=( "WITH ranked_orders AS ( " " SELECT customer_id, status, created_at, " " ROW_NUMBER() OVER (PARTITION BY customer_id " " ORDER BY created_at DESC) AS rn " " FROM orders " ") " "SELECT c.name AS customer_name, " " ro.status AS last_order_status, " " ro.created_at AS last_order_date " "FROM customers c " "JOIN ranked_orders ro ON ro.customer_id = c.id " "WHERE ro.rn = 1 " " AND ro.status = 'cancelled' " "ORDER BY customer_name ASC" ), notes="CTE + ROW_NUMBER window partitioned by customer_id.", ), TaskExample( question=( "Show the monthly revenue from delivered orders and its running total, " "for all months in 2024. " "Return columns: month (format YYYY-MM), monthly_revenue, running_total. " "Round both revenue columns to 2 decimal places. " "Sort by month ascending." ), sql=( "WITH monthly AS ( " " SELECT strftime('%Y-%m', created_at) AS month, " " ROUND(SUM(total_amount), 2) AS monthly_revenue " " FROM orders " " WHERE status = 'delivered' " " AND created_at >= '2024-01-01' " " AND created_at < '2025-01-01' " " GROUP BY strftime('%Y-%m', created_at) " ") " "SELECT month, " " monthly_revenue, " " ROUND(SUM(monthly_revenue) OVER (ORDER BY month), 2) AS running_total " "FROM monthly " "ORDER BY month ASC" ), notes="CTE + cumulative SUM window ordered by month string.", ), TaskExample( question=( "Find products whose average rating is strictly above the average " "rating of all products in their category. " "Return columns: product_name, category_name, " "product_avg_rating, category_avg_rating. " "Round both averages to 2 decimal places. " "Sort by product_avg_rating descending, then product_name ascending." ), sql=( "WITH product_ratings AS ( " " SELECT p.id AS product_id, p.name AS product_name, " " p.category_id, c.name AS category_name, " " ROUND(AVG(r.rating), 2) AS product_avg_rating " " FROM products p " " JOIN reviews r ON r.product_id = p.id " " JOIN categories c ON c.id = p.category_id " " GROUP BY p.id, p.name, p.category_id, c.name " "), " "category_ratings AS ( " " SELECT category_id, " " ROUND(AVG(product_avg_rating), 2) AS category_avg_rating " " FROM product_ratings " " GROUP BY category_id " ") " "SELECT pr.product_name, pr.category_name, " " pr.product_avg_rating, cr.category_avg_rating " "FROM product_ratings pr " "JOIN category_ratings cr ON cr.category_id = pr.category_id " "WHERE pr.product_avg_rating > cr.category_avg_rating " "ORDER BY pr.product_avg_rating DESC, pr.product_name ASC" ), notes="Two CTEs, correlated comparison between product and category averages.", ), ] def description(self) -> str: return ( "Advanced analytics queries using CTEs, window functions " "(DENSE_RANK, ROW_NUMBER, running SUM), and nested subqueries. " "Tests multi-step reasoning and SQLite-specific syntax." )