File size: 3,712 Bytes
2312199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
{
  "task_id": "easy_01",
  "version": "1.0.0",
  "created_at": "2026-03-11",
  "metadata": {
    "domain": "credit_card_optimization",
    "difficulty": "easy",
    "task_number": 1,
    "complexity_hint": {
      "max_tokens": 4000,
      "expected_output": "single card recommendation with EV calculation"
    },
    "requires_human_review": true
  },
  "prompt": {
    "system": "",
    "user": "You are a financial advisor tasked with giving the user the optimal credit card advice for their situation.\n\nUser profile:  \n\\- $3,000 rent payment  \n\\- $2,900 in other general expenses each month  \n\\- Values extra Bilt cash not used for unlocking points on housing at $0.25 USD per $1 of Bilt cash, up to $50 of extra Bilt Cash, after which the user sees no value in additional Bilt Cash\n\nWhich of the two Bilt housing rewards options should the user choose and what is the user\u2019s expected value earned from this choice, ignoring the rest of the card\u2019s benefits? \n\nCard context: Bilt Blue, Bilt Obsidian, Bilt Palladium",
    "knowledge_base_ref": "knowledge_base.md",
    "kb_filter": [
      "Bilt Blue",
      "Bilt Obsidian",
      "Bilt Palladium"
    ],
    "system_prompt_ref": "system_prompt_template.md"
  },
  "scoring": {
    "dimensions": {
      "constraint_compliance": {
        "weight": 0.3,
        "type": "automated",
        "description": "Hard rule checks: velocity limits, eligibility, user constraints",
        "checks": {
          "velocity_rules": null,
          "eligibility_rules": null,
          "user_constraints": null,
          "expected_cards": [],
          "expected_housing_option": "flexible_bilt_cash",
          "key_constraints_flags": [
            "housing_option_choice"
          ]
        },
        "hard_constraint": false
      },
      "ev_accuracy": {
        "weight": 0.4,
        "type": "automated",
        "description": "EV calculation accuracy vs. reference solution",
        "reference": {
          "reference_ev_usd": 78.0,
          "ev_tolerance_pct": 0.05
        }
      },
      "reasoning_quality": {
        "weight": 0.2,
        "type": "human",
        "description": "Quality of tradeoff articulation and strategic reasoning (0-3 scale)",
        "rubric": {
          "0": "No reasoning or incorrect reasoning",
          "1": "Surface-level reasoning, misses key tradeoffs",
          "2": "Correct tradeoffs identified with clear justification",
          "3": "Expert-level nuance including edge cases and constraint interactions"
        },
        "score": null
      },
      "constraint_prioritization": {
        "weight": 0.1,
        "type": "human",
        "description": "Correct handling of ambiguity and conflicting constraints",
        "score": null
      }
    },
    "passing_threshold": 0.6,
    "hard_constraint_failure_zeroes_dimension": true
  },
  "reference_solution": {
    "_status": "EXPERT_REVIEWED",
    "recommended_cards": [],
    "total_ev_usd": 78.0,
    "ev_breakdown": {
      "signup_bonuses_usd": 0.0,
      "ongoing_rewards_usd": 0.0,
      "credits_usd": 0.0,
      "annual_fees_usd": 0.0,
      "other_usd": 78.0
    },
    "housing_option": "flexible_bilt_cash",
    "key_constraints_flags": [
      "housing_option_choice"
    ],
    "expert_notes": "Flexible Bilt Cash option is correct. 4% Bilt Cash on $2,900 = $116/mo. $90/mo used to unlock 3,000 rent points (same as tiered). $26/mo excess Bilt Cash. EV depends on interpretation of '$50 of extra Bilt Cash' limit: per-month interpretation \u2192 $26*12*$0.25 = $78/yr; per-year interpretation \u2192 $50*$0.25 = $12.50/yr. Reference uses per-month ($78). Accept either interpretation with correct math."
  }
}