# Spaces: endishai / lexenvs-harbor (Sleeping)
# File size: 4,014 Bytes
"""Task and evaluation schemas for Harbor RL environment (credit card optimization)."""

from __future__ import annotations

from typing import Any, Literal

from pydantic import Field

from lexenvs.schemas.base import InputSchema, ResponseSchema

# ---------------------------------------------------------------------------
# Internal models — loaded from JSON, never exposed via API
# ---------------------------------------------------------------------------

# How a scoring dimension is graded; used as the type of ``ScoringDimension.type``.
DimensionType = Literal["automated", "human"]
# Outcome state of one dimension in an evaluation result; used as the type of
# ``DimensionResult.status``.
DimensionStatus = Literal["scored", "pending_reference", "requires_human"]


class ComplexityHint(ResponseSchema):
    """Complexity hint embedded in ``TaskMetadata.complexity_hint``."""

    # Required integer token figure.
    # NOTE(review): presumably the token budget for the agent's answer — confirm.
    max_tokens: int
    # Required string.
    # NOTE(review): looks like a description of the expected output form — confirm.
    expected_output: str


class TaskMetadata(ResponseSchema):
    """Descriptive metadata carried by a ``TaskDefinition``."""

    # Domain label of the task (free-form string).
    domain: str
    # Difficulty tier; only these three values validate.
    difficulty: Literal["easy", "medium", "hard"]
    # Integer task number.
    # NOTE(review): presumably an ordinal within the domain — confirm.
    task_number: int
    # Nested complexity hint (``max_tokens`` / ``expected_output``).
    complexity_hint: ComplexityHint
    # Optional flag; defaults to False when the field is omitted.
    requires_human_review: bool = False


class TaskPrompt(ResponseSchema):
    """Prompt delivered to the agent. ``context`` is resolved at load time."""

    # Inline system prompt text; defaults to the empty string.
    system: str = ""
    # Optional reference to a system prompt.
    # NOTE(review): presumably resolved into ``system`` by the loader — confirm.
    system_prompt_ref: str | None = None
    # Context text; defaults to the empty string until resolved at load time.
    context: str = ""
    # User-facing prompt text; the only required field on this model.
    user: str
    # Optional reference to a knowledge base.
    # NOTE(review): presumably the source used to build ``context`` — confirm.
    knowledge_base_ref: str | None = None
    # Optional list of filter strings.
    # NOTE(review): presumably restricts which knowledge-base entries are used —
    # confirm against the loader.
    kb_filter: list[str] | None = None


class ScoringDimension(ResponseSchema):
    """One scoring dimension of a task, stored in ``Scoring.dimensions``."""

    # Weight of this dimension. No range constraint is declared here.
    weight: float
    # Either "automated" or "human" (see ``DimensionType``).
    type: DimensionType
    # Description of the dimension.
    description: str
    # Optional free-form mapping.
    # NOTE(review): presumably the automated check configuration — confirm.
    checks: dict[str, Any] | None = None
    # Optional free-form mapping.
    # NOTE(review): presumably reference values the checks compare against — confirm.
    reference: dict[str, Any] | None = None
    # Optional string-to-string mapping.
    # NOTE(review): presumably rubric level -> description for human grading — confirm.
    rubric: dict[str, str] | None = None
    # Optional score; None when absent.
    score: float | None = None
    # Defaults to False. See ``Scoring.hard_constraint_failure_zeroes_dimension``
    # for the related scoring-level switch.
    hard_constraint: bool = False


class Scoring(ResponseSchema):
    """Scoring configuration of a task: its dimensions plus a passing threshold."""

    # Scoring dimensions keyed by string.
    # NOTE(review): presumably the key is the dimension name reported in
    # ``DimensionResult.dimension`` — confirm.
    dimensions: dict[str, ScoringDimension]
    # Required threshold value. No range constraint is declared here.
    passing_threshold: float
    # Defaults to True.
    # NOTE(review): the name suggests a failed hard constraint zeroes that
    # dimension's score; the logic lives outside this module — confirm.
    hard_constraint_failure_zeroes_dimension: bool = True


class EVBreakdown(ResponseSchema):
    """Optional per-category breakdown used by ``ReferenceSolution.ev_breakdown``.

    Every component is optional and defaults to None.
    """

    # NOTE(review): the ``_usd`` suffix suggests US-dollar amounts and "EV"
    # presumably means expected value; sign conventions (e.g. whether fees are
    # stored as negatives) are not visible here — confirm.
    signup_bonuses_usd: float | None = None
    ongoing_rewards_usd: float | None = None
    credits_usd: float | None = None
    annual_fees_usd: float | None = None
    other_usd: float | None = None


class ReferenceSolution(ResponseSchema):
    """Reference solution stored on a ``TaskDefinition``.

    Only ``status`` is required; every other field is optional and defaults to
    None.
    """

    # Populated from the ``_status`` key of the input via the field alias.
    # NOTE(review): whether the plain name ``status`` is also accepted depends
    # on the ``ResponseSchema`` config, which is not visible here — confirm.
    status: str = Field(alias="_status")
    # Optional list of card identifiers/names.
    recommended_cards: list[str] | None = None
    # Optional total figure.
    # NOTE(review): presumably the sum of ``ev_breakdown`` in USD — confirm.
    total_ev_usd: float | None = None
    # Optional per-category breakdown.
    ev_breakdown: EVBreakdown | None = None
    # Optional string.
    # NOTE(review): meaning of "housing option" is domain-specific — confirm.
    housing_option: str | None = None
    # Optional list of strings.
    key_constraints_flags: list[str] | None = None
    # Optional free-text notes.
    expert_notes: str | None = None


class TaskDefinition(ResponseSchema):
    """Full internal task definition — never exposed via API."""

    # Task identifier.
    task_id: str
    # Version string; defaults to "1.0.0" when omitted.
    version: str = "1.0.0"
    # Optional creation timestamp kept as a plain string (not parsed to datetime).
    created_at: str | None = None
    # Domain, difficulty, task number and complexity hint.
    metadata: TaskMetadata
    # Prompt material delivered to the agent.
    prompt: TaskPrompt
    # Scoring dimensions and passing threshold.
    scoring: Scoring
    # Reference solution; required on every task definition.
    reference_solution: ReferenceSolution


# ---------------------------------------------------------------------------
# API response models — hide reference solution & scoring internals
# ---------------------------------------------------------------------------


class TaskResponse(ResponseSchema):
    """List view — enough to identify and select a task."""

    # Task identifier.
    task_id: str
    # Domain label of the task.
    domain: str
    # NOTE(review): typed as a plain ``str`` here, whereas
    # ``TaskMetadata.difficulty`` is restricted to "easy"/"medium"/"hard" —
    # confirm whether this should share that Literal.
    difficulty: str
    # Integer task number.
    task_number: int


class TaskDetailResponse(TaskResponse):
    """Detail view — full prompt for the agent to work with."""

    # Inherits task_id, domain, difficulty and task_number from TaskResponse.

    # System prompt text (required).
    system_prompt: str
    # Context text (required).
    context: str
    # User prompt text (required).
    user_prompt: str
    # Optional token figure; None when not provided.
    # NOTE(review): presumably copied from ``ComplexityHint.max_tokens`` — confirm.
    max_tokens: int | None = None


class TaskListResponse(ResponseSchema):
    """List of available tasks."""

    # Task summaries in list-view form.
    tasks: list[TaskResponse]
    # Integer count supplied by the caller; not derived from ``tasks`` here.
    # NOTE(review): presumably equals ``len(tasks)`` — confirm.
    total: int


# ---------------------------------------------------------------------------
# Evaluation request / response
# ---------------------------------------------------------------------------


class EvaluateRequest(InputSchema):
    """Agent submits answer as raw text (should contain a JSON block)."""

    # Raw answer text. This model declares no validation of its content, so
    # locating and parsing the JSON block is left to the consumer.
    answer: str


class DimensionResult(ResponseSchema):
    """Per-dimension scoring result."""

    # Name of the dimension this result belongs to.
    dimension: str
    # Optional score; defaults to None.
    # NOTE(review): presumably None whenever ``status`` is not "scored" — confirm.
    score: float | None = None
    # Weight of the dimension.
    weight: float
    # One of "scored", "pending_reference" or "requires_human".
    status: DimensionStatus


class TaskResultResponse(ResponseSchema):
    """Evaluation result returned to the agent."""

    # Identifier of the evaluated task.
    task_id: str
    # Required reward, validated to lie in the closed interval [0.0, 1.0].
    reward: float = Field(ge=0.0, le=1.0)
    # Per-dimension results; a fresh empty list per instance when omitted.
    dimensions: list[DimensionResult] = Field(default_factory=list)
    # Free-form extra data; a fresh empty dict per instance when omitted.
    metadata: dict[str, Any] = Field(default_factory=dict)