File size: 3,704 Bytes
2312199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
{
  "task_id": "objective_easy_02",
  "version": "2.0.0",
  "created_at": "2026-03-12",
  "metadata": {
    "domain": "credit_card_optimization",
    "difficulty": "easy",
    "task_number": 2,
    "complexity_hint": {
      "max_tokens": 4000,
      "expected_output": "single card recommendation for a non-traveler with EV calculation"
    },
    "requires_human_review": false
  },
  "prompt": {
    "system": "",
    "user": "You are a financial advisor. Recommend the single best credit card for this user and calculate the expected value over 12 months.\n\nUser profile:\n- Monthly spending: $300 dining, $800 groceries, $0 travel/flights/hotels, $200 gas, $80 streaming, $0 transit, $1200 everything else\n- Does NOT travel (0 trips per year, no lounge use)\n- Does NOT use Uber, rideshare, or food delivery apps\n- Uses streaming services (Netflix, Spotify)\n- No existing credit cards\n- Time horizon: 12 months\n\nThis user wants the simplest, highest-value card with no annual fee. Provide a detailed EV breakdown.",
    "knowledge_base_ref": "knowledge_base.md",
    "kb_filter": [
      "Chase Freedom Unlimited",
      "Chase Freedom Flex",
      "Citi Double Cash",
      "Amex Blue Cash Preferred",
      "Capital One Savor",
      "BofA Customized Cash Rewards",
      "Wells Fargo Autograph"
    ],
    "system_prompt_ref": "system_prompt_template.md"
  },
  "scoring": {
    "dimensions": {
      "card_selection": {
        "weight": 0.25,
        "type": "automated",
        "description": "F1 of recommended cards vs. optimal set",
        "checks": {
          "expected_cards": [
            "Amex Blue Cash Preferred"
          ]
        },
        "hard_constraint": false
      },
      "ev_accuracy": {
        "weight": 0.3,
        "type": "automated",
        "description": "EV accuracy vs. computed ground truth",
        "reference": {
          "user_profile": {
            "monthly_spend": {
              "dining": 300,
              "groceries": 800,
              "travel": 0,
              "flights": 0,
              "hotels": 0,
              "gas": 200,
              "streaming": 80,
              "transit": 0,
              "everything_else": 1200
            },
            "uses_streaming": true,
            "time_horizon_months": 12
          },
          "expected_card_ids": [
            "amex_blue_cash_preferred"
          ],
          "ev_tolerance_pct": 0.05
        }
      },
      "factual_fidelity": {
        "weight": 0.3,
        "type": "automated",
        "description": "Accuracy of factual claims about cards",
        "reference": {
          "extracted_claims": null,
          "reference_ev_usd": null
        }
      },
      "constraint_compliance": {
        "weight": 0.15,
        "type": "automated",
        "description": "Respects user constraints (prefers no annual fee; no travel cards)",
        "checks": {
          "expected_cards": [
            "Amex Blue Cash Preferred"
          ],
          "expected_housing_option": null
        },
        "hard_constraint": false
      }
    },
    "passing_threshold": 0.6,
    "hard_constraint_failure_zeroes_dimension": true
  },
  "reference_solution": {
    "_status": "COMPUTED",
    "recommended_cards": [
      "Amex Blue Cash Preferred"
    ],
    "total_ev_usd": null,
    "ev_breakdown": null,
    "housing_option": null,
    "key_constraints_flags": [
      "no_annual_fee_preference",
      "non_traveler"
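    ],
    "expert_notes": "Ground truth EV computed from card_database.json. Blue Cash Preferred wins on category rewards: 6% on groceries at $800/mo ($576/yr), 6% on streaming at $80/mo ($57.60/yr), and 3% on gas at $200/mo ($72/yr). The annual fee is waived in the first year, so no fee falls within the 12-month horizon. The Disney streaming credit is usable."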
  }
}