"""
Task definitions and automated graders for the DataDetective environment.

Each task has:
  - id, title, difficulty, description
  - A grader function that scores the agent's final answer based on whether
    key findings are mentioned.

Every grader checks five independent criteria worth 0.20 each, so a raw score
lies in 0.0 - 1.0.  The public entry point, ``grade_answer``, clamps that raw
score into the range 0.05 - 0.95 before returning it.
"""

import re
from typing import Callable

# Bounds applied by ``grade_answer`` to every returned score.
_MIN_SCORE = 0.05
_MAX_SCORE = 0.95

# Matches "west" / "western" only as a standalone word.  A plain substring
# test would also fire on "Midwest", "Southwest", "mid-west", "lowest", ...,
# which credits an answer that names a different region.
_WEST_REGION = r"(?<![\w-])west(?:ern)?\b"


def _has_any(text: str, keywords: list[str]) -> bool:
    """Case-insensitive check: does *text* contain any of *keywords*?

    This is a substring test, not a whole-word test.
    """
    low = text.lower()
    return any(kw.lower() in low for kw in keywords)


def _has_pattern(text: str, pattern: str) -> bool:
    """Return True if regex *pattern* matches anywhere in *text* (ignoring case)."""
    return bool(re.search(pattern, text, re.IGNORECASE))


def _grade_orders_drop(answer: str) -> float:
    """Score an answer for the ``orders_drop`` task.

    Criteria (0.20 each): mentions a drop, mentions a promotion/sale,
    mentions that it ended (or March 1), uses causal language, and
    includes a number.
    """
    score = 0.0
    if _has_any(answer, [
        "drop", "decrease", "decline", "fell", "fewer", "reduction", "lower",
    ]):
        score += 0.20
    # The generic list already contains "sale", so the named-sale check is
    # subsumed by it; both are kept as written.
    if _has_any(answer, ["spring mega sale", "spring sale", "mega sale"]) or (
        _has_any(answer, ["promotion", "promo", "sale", "discount", "campaign"])
    ):
        score += 0.20
    if _has_any(
        answer, ["ended", "expired", "over", "concluded", "stopped"]
    ) or _has_pattern(answer, r"march\s*0?1"):
        score += 0.20
    if _has_any(answer, [
        "caused", "because", "due to", "result of", "led to",
        "when the", "after the", "ending of", "end of the",
        "correlated", "explains",
    ]):
        score += 0.20
    if (
        _has_pattern(answer, r"\d+\s*(orders|transactions)")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\d+.*to\s+\d+")
    ):
        score += 0.20
    return min(score, 1.0)


def _grade_returns_spike(answer: str) -> float:
    """Score an answer for the ``returns_spike`` task.

    Criteria (0.20 each): names the headphones product, names the West
    region, names the supplier, describes a defect/quality problem, and
    includes a quantity or return-rate comparison.
    """
    score = 0.0
    if _has_any(answer, ["wireless headphones", "headphones pro", "headphone"]):
        score += 0.20
    # Whole-word match so that "Midwest" etc. do not earn the West point.
    if _has_pattern(answer, _WEST_REGION):
        score += 0.20
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, [
        "defect", "defective", "faulty", "quality", "high return",
        "return rate", "abnormal", "stopped working", "battery issue",
        "poor audio",
    ]):
        score += 0.20
    if (
        _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"\d+\s*(returns|returned|units)")
        or _has_any(answer, ["return rate", "compared to"])
    ):
        score += 0.20
    return min(score, 1.0)


def _grade_customer_churn(answer: str) -> float:
    """Score an answer for the ``customer_churn`` task.

    Criteria (0.20 each): quantifies or names the decline, names the
    Enterprise segment, names the Northeast region, mentions a price
    change, and names an affected product.
    """
    score = 0.0
    if _has_pattern(answer, r"\d+\s*%") or _has_any(answer, [
        "decline", "decrease", "drop", "churn", "fewer active",
        "lost customers", "stopped ordering",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]):
        score += 0.20
    if _has_any(answer, ["northeast", "north east", "north-east"]):
        score += 0.20
    if _has_any(answer, [
        "price increase", "price change", "price hike", "pricing",
        "more expensive", "raised price", "cost increase",
    ]):
        score += 0.20
    if _has_any(answer, [
        "laptop pro", "desktop workstation", "office suite", "devtools",
        "external ssd",
    ]) or _has_pattern(answer, r"product.*(1|2|11|15|19)"):
        score += 0.20
    return min(score, 1.0)


def _grade_shipping_delay(answer: str) -> float:
    """Score an answer for the ``shipping_delay`` task.

    Criteria (0.20 each): names the Midwest region, names the QuickShip
    carrier, describes delivery delays, dates the onset to (mid-)February,
    and links support complaints to delivery/shipping.
    """
    score = 0.0
    if _has_any(answer, ["midwest"]):
        score += 0.20
    if _has_any(answer, ["quickship", "quick ship"]):
        score += 0.20
    if _has_any(answer, [
        "delivery delay", "late delivery", "delayed shipment",
        "shipping delay", "late shipment", "delivery time",
        "delayed delivery", "slow delivery",
    ]):
        score += 0.20
    if _has_pattern(answer, r"feb(ruary)?\s*(10|mid|middle)") or _has_any(answer, [
        "mid-february", "mid february", "around february",
        "starting in february", "beginning of february",
    ]):
        score += 0.20
    # Both halves are required: a support signal AND a delivery signal.
    if _has_any(answer, [
        "support ticket", "complaint", "ticket volume",
        "customer satisfaction", "support request",
    ]) and _has_any(answer, [
        "delivery", "shipping", "carrier", "quickship",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_revenue_paradox(answer: str) -> float:
    """Score an answer for the ``revenue_paradox`` task.

    Criteria (0.20 each): discount/promotion effect on margin, product-mix
    shift, Enterprise segment loss, returns eroding profit, and a dollar
    or percentage figure.
    """
    score = 0.0
    if _has_any(answer, [
        "spring mega sale", "mega sale", "25%", "25 percent",
    ]) or (
        _has_any(answer, ["promotion", "promo", "discount", "sale"])
        and _has_any(answer, ["margin", "profit", "cost"])
    ):
        score += 0.20
    if _has_any(answer, [
        "product mix", "category mix", "mix shift", "shifted toward",
        "higher proportion", "more electronics", "low-margin",
        "composition changed",
    ]):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price increase", "price change", "price hike",
        "lost", "churn", "left", "fewer", "decline",
    ]):
        score += 0.20
    if _has_any(answer, ["return", "refund"]) and _has_any(answer, [
        "cost", "expense", "profit", "margin", "loss", "erode",
    ]):
        score += 0.20
    if (
        _has_pattern(answer, r"\$\s*[\d,]+")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\$?[\d,]+.*to\s+\$?[\d,]+")
    ):
        score += 0.20
    return min(score, 1.0)


def _grade_supplier_quality(answer: str) -> float:
    """Score an answer for the ``supplier_quality`` task.

    Criteria (0.20 each): names the supplier, names the headphones
    product, names the speaker product, quantifies returns/refunds, and
    mentions defects or complaints.
    """
    score = 0.0
    if _has_any(answer, ["audiotech", "audio tech"]):
        score += 0.20
    if _has_any(answer, ["wireless headphones", "headphones pro", "product 6"]):
        score += 0.20
    if _has_any(answer, ["bluetooth speaker", "product 7"]):
        score += 0.20
    if (
        _has_any(answer, ["return rate", "refund", "return volume"])
        or _has_pattern(answer, r"\d+\s*%.*return")
        or _has_pattern(answer, r"return.*\d+\s*%")
        or _has_pattern(answer, r"\$\s*[\d,]+")
    ):
        score += 0.20
    if _has_any(answer, [
        "support ticket", "defect", "complaint", "product_defect",
        "quality issue", "customer complaint",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_inventory_stockout(answer: str) -> float:
    """Score an answer for the ``inventory_stockout`` task.

    Criteria (0.20 each): names the West region, names the monitor
    product, mentions inventory/stock, ties it to the promotion period,
    and includes a number.
    """
    score = 0.0
    # Whole-word match so that "Midwest" etc. do not earn the West point.
    if _has_pattern(answer, _WEST_REGION):
        score += 0.20
    if _has_any(answer, ["monitor", "product 4", "monitor 27"]):
        score += 0.20
    if _has_any(answer, [
        "inventory", "stock", "out of stock", "stockout", "stock-out",
        "zero units", "no inventory", "warehouse",
    ]):
        score += 0.20
    if _has_any(answer, [
        "spring mega sale", "mega sale", "promo", "promotion",
        "february 15", "feb 15", "during the sale",
    ]):
        score += 0.20
    if (
        _has_pattern(answer, r"\d+\s*(units|orders|sales)")
        or _has_pattern(answer, r"\d+\s*%")
        or _has_pattern(answer, r"from\s+\d+.*to\s+\d+")
    ):
        score += 0.20
    return min(score, 1.0)


def _grade_fraud_detection(answer: str) -> float:
    """Score an answer for the ``fraud_detection`` task.

    Criteria (0.20 each): names the Southeast region, mentions new
    accounts, mentions high-value products/orders, gives a count or dollar
    figure, and describes a suspicious pattern.
    """
    score = 0.0
    if _has_any(answer, ["southeast"]):
        score += 0.20
    if _has_any(answer, [
        "new account", "recent signup", "recently created",
        "new customer", "account creation", "registered in feb",
        "signed up",
    ]):
        score += 0.20
    if _has_any(answer, [
        "high-value", "high value", "expensive", "laptop pro",
        "desktop workstation", "large order", "electronics",
    ]):
        score += 0.20
    if (
        _has_pattern(answer, r"1[0-5]\s*(account|customer|user)")
        or _has_pattern(answer, r"\$\s*[\d,]+")
        or _has_pattern(answer, r"\d+\s*(order|transaction)")
    ):
        score += 0.20
    if _has_any(answer, [
        "pattern", "cluster", "coordinated", "suspicious",
        "same product", "no return", "never returned",
        "concentrated", "anomal", "fraud ring",
    ]):
        score += 0.20
    return min(score, 1.0)


def _grade_repeat_purchase_decline(answer: str) -> float:
    """Score an answer for the ``repeat_purchase_decline`` task.

    Criteria (0.20 each): repeat-purchase decline (named and quantified or
    described), Enterprise pricing/churn, Midwest/shipping impact on
    repeat orders, marketing spend vs. retention, and a segment/cohort
    breakdown.
    """
    score = 0.0
    if _has_any(answer, [
        "repeat purchase", "repeat rate", "returning customer",
        "repeat buyer", "repurchase", "order frequency",
        "second order", "came back",
    ]) and (
        _has_pattern(answer, r"\d+\s*%")
        or _has_any(answer, ["decline", "drop", "decrease", "fell", "collapsed"])
    ):
        score += 0.20
    if _has_any(answer, ["enterprise"]) and _has_any(answer, [
        "price", "increase", "hike", "stopped", "left", "churn",
    ]):
        score += 0.20
    if (
        _has_any(answer, ["midwest"])
        or _has_any(answer, ["shipping", "delivery", "quickship"])
    ) and _has_any(answer, [
        "repeat", "return", "reorder", "come back", "second order",
    ]):
        score += 0.20
    if _has_any(answer, ["marketing", "acquisition", "spend"]) and _has_any(answer, [
        "retention", "email", "loyalty", "re-engage", "lapsed",
        "shifted", "new customer",
    ]):
        score += 0.20
    if _has_any(answer, [
        "segment", "cohort", "by region", "by segment",
        "enterprise vs", "consumer vs", "smb vs",
    ]) or _has_pattern(answer, r"(enterprise|smb|consumer).*\d+\s*%"):
        score += 0.20
    return min(score, 1.0)


# Task catalogue, keyed by task id.  Each entry repeats its id and carries a
# difficulty label, a title and the description text for the task.
TASKS: dict[str, dict] = {
    "orders_drop": {
        "id": "orders_drop",
        "difficulty": "easy",
        "title": "Weekly Orders Drop Investigation",
        "description": (
            "URGENT -- Our order volume dropped sharply in the first two weeks "
            "of March compared to the last two weeks of February. Leadership "
            "needs to know why.\n\n"
            "Investigate the database, identify the root cause of the drop, "
            "and submit a clear summary of your findings."
        ),
    },
    "returns_spike": {
        "id": "returns_spike",
        "difficulty": "medium",
        "title": "Product Returns Spike Investigation",
        "description": (
            "ALERT -- Our return rate has spiked significantly in recent weeks, "
            "with particular concentration in one geographic region. This is "
            "eating into margins.\n\n"
            "Use the database to identify which product(s) are driving the "
            "spike, which region is most affected, and what the likely root "
            "cause is. Include the supplier if relevant."
        ),
    },
    "customer_churn": {
        "id": "customer_churn",
        "difficulty": "hard",
        "title": "Customer Churn Root Cause Analysis",
        "description": (
            "CRITICAL -- Our monthly active customer count has declined "
            "significantly from January to March. The executive team wants a "
            "full root-cause analysis.\n\n"
            "Determine which customer segments and regions are most affected, "
            "quantify the decline, and identify the most likely causes. "
            "Check all available tables for clues."
        ),
    },
    "shipping_delay": {
        "id": "shipping_delay",
        "difficulty": "medium-hard",
        "title": "Customer Satisfaction Crisis Investigation",
        "description": (
            "ESCALATION -- Customer satisfaction scores have plummeted in one "
            "of our regions. The support team is overwhelmed with complaints "
            "and escalations are piling up.\n\n"
            "Investigate what operational issue is driving the complaints, "
            "identify the responsible party (carrier, warehouse, etc.), "
            "determine when the problem started, and quantify the impact. "
            "Cross-reference multiple data sources for a complete picture."
        ),
    },
    "revenue_paradox": {
        "id": "revenue_paradox",
        "difficulty": "hard",
        "title": "Revenue vs. Profit Paradox Investigation",
        "description": (
            "CRITICAL -- Revenue in February was our highest month ever, yet "
            "gross profit actually *decreased* compared to January. The CFO "
            "wants a full breakdown of why we are selling more but earning "
            "less.\n\n"
            "Analyze revenue, costs, margins, discounts, product mix, customer "
            "segments, and any other relevant factors. This is likely multi-"
            "causal -- identify ALL contributing factors and quantify their "
            "impact. Use the products.cost column to compute margins."
        ),
    },
    "supplier_quality": {
        "id": "supplier_quality",
        "difficulty": "medium",
        "title": "Supplier Quality Crisis Investigation",
        "description": (
            "ESCALATION -- The VP of Merchandising has received escalating "
            "complaints about product quality across multiple SKUs. Quality "
            "Assurance wants a supplier-level analysis.\n\n"
            "Determine which supplier(s) have systemic quality issues, which "
            "of their products are affected, and quantify the total business "
            "impact in returns, refunds, and support ticket volume. Include "
            "return rates by supplier to support a contract renegotiation."
        ),
    },
    "inventory_stockout": {
        "id": "inventory_stockout",
        "difficulty": "medium-hard",
        "title": "Regional Sales Underperformance Investigation",
        "description": (
            "INVESTIGATION -- Our West region was projected to be the top "
            "performer during the Spring Mega Sale based on historical trends "
            "and marketing investment, but actual sales came in significantly "
            "below the other regions.\n\n"
            "The Regional VP demands an explanation. Investigate what caused "
            "the West to underperform during our biggest promotional event. "
            "Check product-level sales, inventory data, and any operational "
            "issues that may have limited fulfillment."
        ),
    },
    "fraud_detection": {
        "id": "fraud_detection",
        "difficulty": "hard",
        "title": "Suspicious Order Pattern Investigation",
        "description": (
            "ALERT -- The Finance team has flagged a suspicious spike in "
            "high-value orders from recently created accounts. Several of "
            "these orders have already shipped.\n\n"
            "Investigate the pattern: identify the suspicious accounts, "
            "determine the scope of potential fraud, estimate the financial "
            "exposure, and describe the behavioral signatures that "
            "distinguish these accounts from legitimate customers. Look at "
            "signup dates, order values, product choices, and geographic "
            "concentration."
        ),
    },
    "repeat_purchase_decline": {
        "id": "repeat_purchase_decline",
        "difficulty": "hard",
        "title": "Customer Retention Crisis Investigation",
        # NOTE(review): "%%" is emitted literally here (these are plain string
        # literals).  It is only correct if a consumer %-formats the
        # description -- confirm, otherwise it should read "40%" / "20%".
        "description": (
            "CRITICAL -- Monthly unique buyer count has held steady around "
            "100, but the Customer Success team reports that repeat purchase "
            "rates have collapsed. In January, roughly 40%% of orders came "
            "from returning customers; by March, it appears to be under 20%%."
            "\n\n"
            "The CEO asks: are we becoming a one-time-purchase business? "
            "Diagnose which customer segments and regions lost repeat buyers, "
            "identify the root causes, and determine whether our marketing "
            "spend strategy is masking a retention problem. Check the "
            "marketing_spend table for clues about acquisition vs. retention "
            "investment."
        ),
    },
}

# Maps each task id in TASKS to the function that scores answers for it.
_GRADERS: dict[str, Callable[[str], float]] = {
    "orders_drop": _grade_orders_drop,
    "returns_spike": _grade_returns_spike,
    "customer_churn": _grade_customer_churn,
    "shipping_delay": _grade_shipping_delay,
    "revenue_paradox": _grade_revenue_paradox,
    "supplier_quality": _grade_supplier_quality,
    "inventory_stockout": _grade_inventory_stockout,
    "fraud_detection": _grade_fraud_detection,
    "repeat_purchase_decline": _grade_repeat_purchase_decline,
}


def grade_answer(task_id: str, answer: str) -> float:
    """Grade *answer* for the task identified by *task_id*.

    Args:
        task_id: Key into the grader table (same ids as ``TASKS``).
        answer: The agent's final answer text.  ``None`` is treated as an
            empty answer and any other non-string value is converted with
            ``str()``, so a missing answer scores the minimum instead of
            raising.

    Returns:
        The grader's raw score clamped into [0.05, 0.95].  An unknown
        *task_id* returns 0.05.
    """
    grader = _GRADERS.get(task_id)
    if grader is None:
        return _MIN_SCORE
    if not isinstance(answer, str):
        # The graders call str methods on the answer; normalise it first.
        answer = "" if answer is None else str(answer)
    raw = grader(answer)
    return max(_MIN_SCORE, min(_MAX_SCORE, raw))