"""Task and evaluation schemas for Harbor RL environment (credit card optimization).""" from __future__ import annotations from typing import Any, Literal from pydantic import Field from lexenvs.schemas.base import InputSchema, ResponseSchema # --------------------------------------------------------------------------- # Internal models — loaded from JSON, never exposed via API # --------------------------------------------------------------------------- DimensionType = Literal["automated", "human"] DimensionStatus = Literal["scored", "pending_reference", "requires_human"] class ComplexityHint(ResponseSchema): max_tokens: int expected_output: str class TaskMetadata(ResponseSchema): domain: str difficulty: Literal["easy", "medium", "hard"] task_number: int complexity_hint: ComplexityHint requires_human_review: bool = False class TaskPrompt(ResponseSchema): """Prompt delivered to the agent. ``context`` is resolved at load time.""" system: str = "" system_prompt_ref: str | None = None context: str = "" user: str knowledge_base_ref: str | None = None kb_filter: list[str] | None = None class ScoringDimension(ResponseSchema): weight: float type: DimensionType description: str checks: dict[str, Any] | None = None reference: dict[str, Any] | None = None rubric: dict[str, str] | None = None score: float | None = None hard_constraint: bool = False class Scoring(ResponseSchema): dimensions: dict[str, ScoringDimension] passing_threshold: float hard_constraint_failure_zeroes_dimension: bool = True class EVBreakdown(ResponseSchema): signup_bonuses_usd: float | None = None ongoing_rewards_usd: float | None = None credits_usd: float | None = None annual_fees_usd: float | None = None other_usd: float | None = None class ReferenceSolution(ResponseSchema): status: str = Field(alias="_status") recommended_cards: list[str] | None = None total_ev_usd: float | None = None ev_breakdown: EVBreakdown | None = None housing_option: str | None = None key_constraints_flags: list[str] | None = None expert_notes: str | None = None class TaskDefinition(ResponseSchema): """Full internal task definition — never exposed via API.""" task_id: str version: str = "1.0.0" created_at: str | None = None metadata: TaskMetadata prompt: TaskPrompt scoring: Scoring reference_solution: ReferenceSolution # --------------------------------------------------------------------------- # API response models — hide reference solution & scoring internals # --------------------------------------------------------------------------- class TaskResponse(ResponseSchema): """List view — enough to identify and select a task.""" task_id: str domain: str difficulty: str task_number: int class TaskDetailResponse(TaskResponse): """Detail view — full prompt for the agent to work with.""" system_prompt: str context: str user_prompt: str max_tokens: int | None = None class TaskListResponse(ResponseSchema): """List of available tasks.""" tasks: list[TaskResponse] total: int # --------------------------------------------------------------------------- # Evaluation request / response # --------------------------------------------------------------------------- class EvaluateRequest(InputSchema): """Agent submits answer as raw text (should contain a JSON block).""" answer: str class DimensionResult(ResponseSchema): """Per-dimension scoring result.""" dimension: str score: float | None = None weight: float status: DimensionStatus class TaskResultResponse(ResponseSchema): """Evaluation result returned to the agent.""" task_id: str reward: float = Field(ge=0.0, 
le=1.0) dimensions: list[DimensionResult] = Field(default_factory=list) metadata: dict[str, Any] = Field(default_factory=dict)
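

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the schema definitions).
# Assumes ResponseSchema is a Pydantic v2 BaseModel subclass, so
# model_validate() resolves the ``_status`` alias on ReferenceSolution.
# The sample payload and its field values below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = {
        "task_id": "cc-opt-001",  # hypothetical task id
        "metadata": {
            "domain": "credit_cards",
            "difficulty": "medium",
            "task_number": 1,
            "complexity_hint": {"max_tokens": 2048, "expected_output": "json"},
        },
        "prompt": {"user": "Pick the best two-card setup for $30k annual spend."},
        "scoring": {
            "dimensions": {
                "ev_accuracy": {
                    "weight": 1.0,
                    "type": "automated",
                    "description": "Total EV within tolerance of reference.",
                }
            },
            "passing_threshold": 0.7,
        },
        "reference_solution": {"_status": "complete", "total_ev_usd": 850.0},
    }

    # Validate the raw JSON payload into the full internal definition.
    task = TaskDefinition.model_validate(sample)

    # Build the public detail view by hand, so scoring internals and the
    # reference solution never leave the server.
    detail = TaskDetailResponse(
        task_id=task.task_id,
        domain=task.metadata.domain,
        difficulty=task.metadata.difficulty,
        task_number=task.metadata.task_number,
        system_prompt=task.prompt.system,
        context=task.prompt.context,
        user_prompt=task.prompt.user,
        max_tokens=task.metadata.complexity_hint.max_tokens,
    )
    print(detail.model_dump_json(indent=2))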