# Environment manifest: identity and human-readable summary.
name: InferenceGym
# Kept quoted so version values that look numeric (e.g. 1.10) still load as
# strings.
version: "1.0.0"
# Folded scalar with strip chomping (`>-`): the lines below are joined with
# single spaces and no trailing newline is appended to the loaded value.
description: >-
  OpenEnv-compliant RL environment for LLM inference serving optimization.
  Teaches agents to make real-time serving configuration decisions for LLM
  infrastructure using trace-driven simulation grounded in Orca, vLLM, and Decima.
author: team-llmserve
# Keyword tags for this environment; a short leaf list, so flow style is used.
tags: [openenv, rl, llm, inference, serving]
# Maps each endpoint name to its path. Values are quoted so they are always
# loaded as plain strings.
endpoints:
  reset: '/reset'
  step: '/step'
  state: '/state'
  tasks: '/tasks'
  grader: '/grader'
  baseline: '/baseline'
  health: '/health'
# Task catalogue: one entry per workload scenario, listed easy -> medium -> hard.
# Every entry carries an id, display name, description, difficulty label,
# episode length, and an SLO threshold keyed by p99_ttft_ms.
# Descriptions use `>-` so the wrapped lines load as one line with no
# trailing newline.
tasks:
  # Easy: loosest p99_ttft_ms threshold of the three tasks (500).
  - id: static_workload
    name: Static Uniform Workload
    description: >-
      Steady 10 rps traffic with uniform prompt lengths.
      Tests basic queue pressure response.
    difficulty: easy
    episode_length: 200
    slo_thresholds:
      p99_ttft_ms: 500

  # Medium: shorter episode than the other two tasks (120 vs 200).
  - id: bursty_workload
    name: Bursty ShareGPT Workload
    description: >-
      Alternating quiet/burst phases with real ShareGPT prompt distributions.
      Tests non-stationary traffic adaptation.
    difficulty: medium
    episode_length: 120
    slo_thresholds:
      p99_ttft_ms: 300

  # Hard: tightest p99_ttft_ms threshold of the three tasks (200).
  - id: adversarial_multitenant
    name: Adversarial Multi-Tenant Serving
    description: >-
      Sinusoidal arrival with mega-prompt injections and multi-priority routing.
      Challenges frontier models.
    difficulty: hard
    episode_length: 200
    slo_thresholds:
      p99_ttft_ms: 200
# Observation fields, one list entry per field. Numeric fields declare a type
# plus min/max bounds; task_id is a string and declares no bounds.
observation_space:
  - name: queue_depth
    type: int
    min: 0
    max: 10000
  - name: active_requests
    type: int
    min: 0
    max: 512
  - name: kv_cache_occupancy
    type: float
    min: 0.0
    max: 1.0
  - name: mean_prompt_length
    type: float
    min: 0.0
    max: 10000.0
  - name: p50_ttft_ms
    type: float
    min: 0.0
    max: 10000.0
  - name: p99_ttft_ms
    type: float
    min: 0.0
    max: 10000.0
  - name: p50_itl_ms
    type: float
    min: 0.0
    max: 1000.0
  - name: throughput_tps
    type: float
    min: 0.0
    max: 1000.0
  - name: slo_compliance_rate
    type: float
    min: 0.0
    max: 1.0
  - name: gpu_memory_used_gb
    type: float
    min: 0.0
    max: 80.0
  - name: estimated_cost_per_1k
    type: float
    min: 0.0
    max: 1.0
  - name: request_arrival_rate
    type: float
    min: 0.0
    max: 500.0
  - name: spec_acceptance_rate
    type: float
    min: 0.0
    max: 1.0
  - name: eviction_events
    type: int
    min: 0
    max: 1000
  # Upper bound equals the largest episode_length declared under tasks (200).
  - name: step_index
    type: int
    min: 0
    max: 200
  - name: task_id
    type: string
# Action fields, one list entry per field. int/float fields declare min/max
# bounds, the enum field lists its allowed values, and bool fields declare
# nothing beyond their type.
action_space:
  - name: batch_cap
    type: int
    min: 1
    max: 512
  - name: kv_budget_fraction
    type: float
    min: 0.1
    max: 1.0
  - name: speculation_depth
    type: int
    min: 0
    max: 8
  - name: quantization_tier
    type: enum
    values: [FP16, INT8, INT4]
  - name: prefill_decode_split
    type: bool
  - name: priority_routing
    type: bool
# Two-element numeric ranges; presumably [min, max] - confirm with the consumer.
reward_range:
  - -1.0
  - 1.0
grader_range:
  - 0.0
  - 1.0