Spaces:

rampluto
/

medusa_env

Running

App Files Files Community

rampluto committed on 14 days ago

Commit

fe7e1aa

verified ·

1 Parent(s): fd09b74

Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

README.md +16 -7
__init__.py +2 -0
client.py +1 -1
inference.py +266 -0
openenv.yaml +64 -0
run_episode.py +88 -0
scenarios.py +5 -1
scripts/env_test.py +20 -0
scripts/hf_test.py +17 -0
server/app.py +1 -2
server/medusa_env.py +40 -14
validate.sh +185 -0

README.md CHANGED Viewed

@@ -132,7 +132,7 @@ Random seeds produce blended variants.
 ```bash
 # Clone / navigate to repo
-cd /path/to/OpenEnv
 # Create venv and install all deps (including pandas, numpy)
 uv sync
@@ -148,23 +148,25 @@ source .venv/bin/activate
 ### Start the FastAPI server
 ```bash
-uvicorn envs.medusa_env.server.app:app --reload --host 0.0.0.0 --port 8000
 ```
 API docs available at `http://localhost:8000/docs`.
 ### Run tests
 ```bash
 python -m pytest tests/envs/test_medusa_environment.py -v
-# 39 passed in ~4s
 ```
 ### Run a manual episode (Python)
 ```python
-from envs.medusa_env import MedusaEnv, MedusaAction
-from envs.medusa_env.models import MedusaActionType
 env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
 obs = env.reset(seed=0)  # seed 0 = clean scenario
@@ -186,6 +188,13 @@ for action_type in [
 print(f"\nGrader: {env.state.grader_report}")
 ```
 ---
 ## Architecture
@@ -193,7 +202,6 @@ print(f"\nGrader: {env.state.grader_report}")
 ```
 envs/medusa_env/
 ├── __init__.py          # Package exports
-├── medusa_env.py        # MedusaEnv — reset / step / commit loop
 ├── models.py            # MedusaAction, MedusaObservation, MedusaState (Pydantic)
 ├── scenarios.py         # ScenarioGenerator — procedural Bronze A/B DataFrames
 ├── operators.py         # Stateless ETL functions (sync_check, prep_keys, execute_join, apply_scd …)
@@ -202,6 +210,7 @@ envs/medusa_env/
 ├── openenv.yaml         # OpenEnv environment manifest
 └── server/
     └── app.py           # FastAPI app via create_app()
 tests/envs/
 └── test_medusa_environment.py   # 39 tests across 6 test classes

 ```bash
 # Clone / navigate to repo
+cd Medusa
 # Create venv and install all deps (including pandas, numpy)
 uv sync
 ### Start the FastAPI server
 ```bash
+openenv validate
+openenv build --tag openenv-medusa
+docker run -p 8000:8000 openenv-medusa:latest
 ```
 API docs available at `http://localhost:8000/docs`.
+Playground available at `http://localhost:8000/web`.
 ### Run tests
 ```bash
 python -m pytest tests/envs/test_medusa_environment.py -v
+# 53 passed in ~4s
 ```
 ### Run a manual episode (Python)
 ```python
+from medusa_env.server import MedusaEnv
+from medusa_env.models import MedusaActionType, MedusaAction
 env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
 obs = env.reset(seed=0)  # seed 0 = clean scenario
 print(f"\nGrader: {env.state.grader_report}")
 ```
+### Push to Hugging Face
+```bash
+openenv push --repo-id <hf_username>/<hf_space>
+```
+The deployed Space is then served at `https://<hf_username>-<hf_space>.hf.space`; use that as `BASE_URL`.
 ---
 ## Architecture
 ```
 envs/medusa_env/
 ├── __init__.py          # Package exports
 ├── models.py            # MedusaAction, MedusaObservation, MedusaState (Pydantic)
 ├── scenarios.py         # ScenarioGenerator — procedural Bronze A/B DataFrames
 ├── operators.py         # Stateless ETL functions (sync_check, prep_keys, execute_join, apply_scd …)
 ├── openenv.yaml         # OpenEnv environment manifest
 └── server/
     └── app.py           # FastAPI app via create_app()
+    ├── medusa_env.py        # MedusaEnv — reset / step / commit loop
 tests/envs/
 └── test_medusa_environment.py   # 39 tests across 6 test classes

__init__.py CHANGED Viewed

@@ -15,9 +15,11 @@ from .models import MedusaAction, MedusaActionType, MedusaObservation, MedusaSta
 from .rewards import RewardEngine
 from .scenarios import Scenario, ScenarioGenerator
 from .tasks import TASKS, Task, TaskResult, score_episode
 __all__ = [
     "medusa_env",
     "MedusaAction",
     "MedusaActionType",
     "MedusaObservation",

 from .rewards import RewardEngine
 from .scenarios import Scenario, ScenarioGenerator
 from .tasks import TASKS, Task, TaskResult, score_episode
+from server.medusa_env import MedusaEnv
 __all__ = [
     "medusa_env",
+    "MedusaEnv"
     "MedusaAction",
     "MedusaActionType",
     "MedusaObservation",

client.py CHANGED Viewed

@@ -28,7 +28,7 @@ try:
     from openenv.core.client_types import StepResult
     from openenv.core.env_client import EnvClient
-    from .models import MedusaAction, MedusaObservation, MedusaState
 except ImportError:
     from models import MedusaAction, MedusaObservation, MedusaState

     from openenv.core.client_types import StepResult
     from openenv.core.env_client import EnvClient
+    from medusa_env.models import MedusaAction, MedusaObservation, MedusaState
 except ImportError:
     from models import MedusaAction, MedusaObservation, MedusaState

inference.py ADDED Viewed

	@@ -0,0 +1,266 @@

+"""MEDUSA inference script — OpenEnv Hackathon submission.
+Runs an LLM agent (via an OpenAI-compatible API) against the MEDUSA task
+selected by TASK_NAME and reports that task's score (0.0–1.0).
+Environment variables (all optional; each has a built-in default):
+    API_BASE_URL   The API endpoint for the LLM (OpenAI-compatible). Defaults to https://router.huggingface.co/v1
+    MODEL_NAME     The model identifier to use for inference.
+    HF_TOKEN       Your Hugging Face / API key (also accepts API_KEY).
+Usage:
+    export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
+    export HF_TOKEN="hf-..."
+    python inference.py
+Output:
+    Prints one [START] line, one [STEP] line per action, and a final [END]
+    line (success, steps, score, rewards) to stdout.
+    The exit code does not depend on the score; read it from the [END] line.
+"""
+from __future__ import annotations
+import json
+import os
+import sys
+import textwrap
+import time
+from typing import List, Optional
+# ---------------------------------------------------------------------------
+# Validate required environment variables before anything else
+# ---------------------------------------------------------------------------
+API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or "mock-key"
+MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+TASK_NAME = os.getenv("TASK_NAME", "clean_pipeline")
+BENCHMARK = os.getenv("BENCHMARK", "medusa_env")
+_missing = [k for k, v in {
+    "API_BASE_URL": API_BASE_URL,
+    "MODEL_NAME": MODEL_NAME,
+    "API_KEY (or HF_TOKEN)": API_KEY,
+}.items() if not v]
+if _missing:
+    print(f"ERROR: Missing required environment variables: {', '.join(_missing)}", file=sys.stderr)
+    print("Set them before running:", file=sys.stderr)
+    for k in _missing:
+        print(f"  export {k}=<value>", file=sys.stderr)
+    sys.exit(1)
+# ---------------------------------------------------------------------------
+# OpenAI client (uses API_BASE_URL + HF_TOKEN as the key)
+# ---------------------------------------------------------------------------
+from openai import OpenAI  # noqa: E402
+client = OpenAI(
+    base_url=API_BASE_URL,
+    api_key=API_KEY,
+)
+# ---------------------------------------------------------------------------
+# MEDUSA environment imports
+# ---------------------------------------------------------------------------
+from pathlib import Path
+# Dynamically add the OpenEnv repo root to sys.path so absolute imports work
+# no matter where this script is executed from.
+repo_root = str(Path(__file__).resolve().parent.parent.parent)
+if repo_root not in sys.path:
+    sys.path.insert(0, repo_root)
+try:
+    # In-repo
+    from envs.medusa_env import MedusaEnv
+    from envs.medusa_env.models import MedusaAction, MedusaActionType
+    from envs.medusa_env.tasks import TASKS, TaskResult, score_episode
+except ImportError:
+    # Standalone (running from inside envs/medusa_env/ installation)
+    from medusa_env import MedusaEnv  # type: ignore
+    from models import MedusaAction, MedusaActionType  # type: ignore
+    from tasks import TASKS, TaskResult, score_episode  # type: ignore
+# ---------------------------------------------------------------------------
+# System prompt
+# ---------------------------------------------------------------------------
+SYSTEM_PROMPT = textwrap.dedent("""
+You are a data integration agent controlling a Bronze→Silver ETL pipeline.
+You observe a 16-float feature vector describing data quality signals, and
+you must choose one action per step from the list below.
+ACTIONS (respond with ONLY the action name — nothing else):
+  SYNC_CHECK          — Verify source freshness before processing
+  EVOLVE_SCHEMA       — Add new columns from sources into Silver schema
+  PREP_KEYS_A         — Clean and normalise join keys in Source A (Fact)
+  PREP_KEYS_B         — Clean and normalise join keys in Source B (Dimension)
+  DEDUPLICATE_B       — Remove duplicate keys from Source B
+  EXECUTE_JOIN_INNER  — Inner join A ⋈ B
+  EXECUTE_JOIN_LEFT   — Left join A ⋈ B (keeps all Fact rows; orphans → quarantine)
+  EXECUTE_JOIN_ANTI   — Anti-join: extract Fact rows with no Dimension match
+  APPLY_SCD_1         — Overwrite Silver records (SCD Type 1)
+  APPLY_SCD_2         — Close old records and insert new with timestamps (SCD Type 2)
+  COMMIT              — Finalise pipeline and trigger audit
+STRATEGY:
+1. Always call SYNC_CHECK first to verify freshness.
+2. If schema drift signals are non-zero (features[9] or [10] > 0), call EVOLVE_SCHEMA.
+3. If null key ratios (features[4] or [5] > 0), call PREP_KEYS_A and/or PREP_KEYS_B.
+4. If Dimension uniqueness (features[7]) < 1.0, call DEDUPLICATE_B.
+5. Prefer EXECUTE_JOIN_LEFT to preserve all Fact rows.
+6. Prefer APPLY_SCD_2 for tracked history.
+7. Call COMMIT when pipeline is complete.
+The feature vector indices:
+  [0]  time_delta_a_norm   [1]  time_delta_b_norm
+  [2]  is_stale_a          [3]  is_stale_b
+  [4]  null_ratio_key_a    [5]  null_ratio_key_b
+  [6]  uniqueness_a        [7]  uniqueness_b
+  [8]  match_rate          [9]  new_cols_a_norm
+  [10] new_cols_b_norm     [11] schema_compat
+  [12] did_prep_a          [13] did_prep_b
+  [14] did_dedup_b         [15] step_frac
+""").strip()
+# ---------------------------------------------------------------------------
+# LLM action chooser
+# ---------------------------------------------------------------------------
+VALID_ACTIONS = {a.value for a in MedusaActionType}
+def choose_action(
+    features: List[float],
+    history: List[dict],
+    step: int,
+) -> str:
+    """Ask the LLM to choose the next action given the current observation."""
+    feature_str = ", ".join(f"{v:.3f}" for v in features)
+    user_msg = (
+        f"Step {step}. Feature vector: [{feature_str}]\n"
+        "What is the single best next action? Respond with ONLY the action name."
+    )
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # Include the last 4 steps of history for context (keep prompt short)
+    for h in history[-4:]:
+        messages.append({"role": "user", "content": h["user"]})
+        messages.append({"role": "assistant", "content": h["assistant"]})
+    messages.append({"role": "user", "content": user_msg})
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        # max_tokens=20,
+        max_completion_tokens=256,
+        temperature=0.1,
+    )
+    raw = response.choices[0].message.content.strip().upper().replace(" ", "_")
+    # Fuzzy match: accept if the response contains a valid action name
+    for action in VALID_ACTIONS:
+        if action in raw:
+            return action
+    # Fallback: extract the longest matching token
+    for action in sorted(VALID_ACTIONS, key=len, reverse=True):
+        if action.replace("_", "") in raw.replace("_", ""):
+            return action
+    # Hard fallback: commit to end gracefully
+    return MedusaActionType.COMMIT.value
+# ---------------------------------------------------------------------------
+# Logging Functions (Hackathon STDOut Format)
+# ---------------------------------------------------------------------------
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    """Print one [STEP] record and flush stdout.
+
+    A falsy ``error`` is shown as ``null``; ``done`` is shown lowercased and
+    ``reward`` with two decimals.
+    """
+    shown_error = error or "null"
+    shown_done = str(done).lower()
+    record = f"[STEP] step={step} action={action} reward={reward:.2f} done={shown_done} error={shown_error}"
+    print(record, flush=True)
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+# ---------------------------------------------------------------------------
+# Run one task
+# ---------------------------------------------------------------------------
+def run_task(task_id: str, max_steps: int = 15) -> None:
+    """Run the LLM agent for one MEDUSA task using required hackathon STDOUT format."""
+    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
+    task = TASKS[task_id]
+    env = MedusaEnv(n_fact_rows=200, n_dim_rows=150, max_steps=max_steps)
+    obs = env.reset(seed=task.seed)
+    history: List[dict] = []
+    rewards_list: List[float] = []
+    step = 0
+    success = False
+    score = 0.0
+    try:
+        while not obs.done and step < max_steps:
+            step += 1
+            action_str = choose_action(obs.features, history, step)
+            # Since the environment throws errors on bad actions, we just pass the action string.
+            try:
+                action_type = MedusaActionType(action_str)
+            except ValueError:
+                action_type = MedusaActionType.COMMIT # default fallback
+            action = MedusaAction(action=action_type)
+            obs = env.step(action)
+            reward = obs.reward or 0.0
+            rewards_list.append(reward)
+            log_step(step=step, action=action_str, reward=reward, done=obs.done, error=None)
+            history.append({
+                "user": (f"Step {step}. Features: [{', '.join(f'{v:.3f}' for v in obs.features)}]"
+                         " What action?"),
+                "assistant": action_str,
+            })
+            if obs.done:
+                break
+        # Tally final score via grader
+        result = score_episode(task_id, env.state, env._tables)
+        score = result.score
+        success = result.passed
+    except Exception as e:
+        log_step(step=step+1 if step > 0 else 1, action="ERROR", reward=0.0, done=True, error=str(e))
+    finally:
+        log_end(success=success, steps=step, score=score, rewards=rewards_list)
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    # Do not loop task variants anymore; run dynamically via TASK_NAME
+    run_task(TASK_NAME)
+if __name__ == "__main__":
+    main()

openenv.yaml CHANGED Viewed

@@ -4,3 +4,67 @@ type: space
 runtime: fastapi
 app: server.app:app
 port: 8000

 runtime: fastapi
 app: server.app:app
 port: 8000
+tasks:
+  - id: clean_pipeline
+    name: Clean Pipeline
+    difficulty: easy
+    seed: 0
+    description: >
+      Both sources are fresh. Join keys are clean and unique. The agent must
+      verify freshness, prepare keys, join, apply SCD, and commit without
+      triggering a row explosion.
+    success_criteria:
+      - COMMIT issued (episode finalized)
+      - No Cartesian explosion detected
+      - Silver row count <= Source A row count
+      - match_rate > 0.80 after join
+    scoring_rubric:
+      committed: 0.20
+      no_explosion: 0.25
+      volume_ok: 0.20
+      high_match: 0.20
+      grader_pass: 0.15
+  - id: dirty_integration
+    name: Dirty Key Integration
+    difficulty: medium
+    seed: 1
+    description: >
+      Source A has NULLs and whitespace in join keys. Source B has duplicate
+      keys that can cause row explosion. The agent must PREP_KEYS and
+      DEDUPLICATE before joining, and correctly quarantine unresolvable
+      orphans.
+    success_criteria:
+      - PREP_KEYS_A issued before EXECUTE_JOIN
+      - PREP_KEYS_B issued before EXECUTE_JOIN
+      - DEDUPLICATE_B issued before EXECUTE_JOIN
+      - No row explosion
+      - Quarantine integrity check passes
+    scoring_rubric:
+      committed: 0.10
+      prepped_before_join: 0.20
+      deduped_before_join: 0.20
+      no_explosion: 0.25
+      integrity_ok: 0.15
+      grader_pass: 0.10
+  - id: full_medallion
+    name: Full Medallion Integration
+    difficulty: hard
+    seed: 2
+    description: >
+      Source A is stale (>6h old). Source B has new schema columns not
+      registered in Silver. The agent must check freshness, evolve the schema,
+      clean keys, deduplicate, execute a left join, apply SCD-2 for tracked
+      columns, and pass all grader checks.
+    success_criteria:
+      - SYNC_CHECK issued before any join
+      - EVOLVE_SCHEMA issued before COMMIT
+      - SCD-2 applied (not SCD-1) for tracked column
+      - Silver schema contains new columns from drift
+      - All 4 grader checks pass
+    scoring_rubric:
+      committed: 0.05
+      sync_checked: 0.15
+      schema_evolved: 0.15
+      used_scd2: 0.20
+      schema_ok: 0.20
+      grader_pass: 0.25

run_episode.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import random
+# Support both installed package usage (medusa_env) and in-repo local modules
+try:
+    from medusa_env import (
+        medusa_env,
+        MedusaAction,
+        MedusaActionType,
+        MedusaObservation,
+    )
+except ImportError:
+    # Fallback to local modules when running from the repo root without installing
+    from client import medusa_env
+    from models import MedusaAction, MedusaActionType, MedusaObservation
+MEDUSA_URL = 'https://anubhavkamal-medusa-env.hf.space'
+# MEDUSA_URL = 'http://localhost:8000'
+class RandomPolicy:
+    """Pure random — baseline for MEDUSA."""
+    name = "Random"
+    def select_action(self, obs: MedusaObservation) -> MedusaActionType:
+        # Pick randomly from the 11 valid operators
+        return random.choice(list(MedusaActionType))
+class AlwaysCommitPolicy:
+    """Immediately terminates the episode by committing."""
+    name = "Always Commit"
+    def select_action(self, obs: MedusaObservation) -> MedusaActionType:
+        return MedusaActionType.COMMIT
+class CleanPipelinePolicy:
+    """Hardcoded sequence to perfectly solve the Easy (Clean Pipeline) task."""
+    name = "Clean Pipeline Heuristic"
+    def __init__(self):
+        # The correct sequence of operations for the clean pipeline scenario
+        self.sequence = [
+            MedusaActionType.SYNC_CHECK,
+            MedusaActionType.PREP_KEYS_A,
+            MedusaActionType.PREP_KEYS_B,
+            MedusaActionType.EXECUTE_JOIN_LEFT,
+            MedusaActionType.APPLY_SCD_2,
+            MedusaActionType.COMMIT
+        ]
+        self.step = 0
+    def select_action(self, obs: MedusaObservation) -> MedusaActionType:
+        if self.step < len(self.sequence):
+            action = self.sequence[self.step]
+            self.step += 1
+            return action
+        return MedusaActionType.COMMIT
+print("Policies defined: Random, Always Commit, Clean Pipeline Heuristic")
+def run_episode(env, policy, seed=0, verbose=False):
+    """Play one episode. Returns the final reward (-1.0 to 1.0)."""
+    result = env.reset(seed=seed)
+    step = 0
+    while not result.done:
+        action_type = policy.select_action(result.observation)
+        if verbose:
+            print(f'  Step {step}: {action_type.value}')
+        result = env.step(MedusaAction(action=action_type))
+        step += 1
+    if verbose:
+        print(f'  Result: Done (reward={result.reward})')
+        print(f'  Terminal Message: {result.observation.message}')
+        if result.observation.metrics:
+            print(f'  Final Grade: {result.observation.metrics.get("grader_report")}')
+    return result.reward
+# Demo: one verbose episode with CleanPipelinePolicy
+with medusa_env(base_url=MEDUSA_URL).sync() as env:
+    print('\nTesting Clean Pipeline Policy — single episode (seed=0):')
+    run_episode(env, CleanPipelinePolicy(), seed=0, verbose=True)

scenarios.py CHANGED Viewed

@@ -85,7 +85,10 @@ def _make_dim(
     if match_keys:
         # Choose from overlap pool to control referential integrity
         available = list(match_keys)
-        keys = [rng.choice(available) for _ in range(n_rows)]
     else:
         keys = [f"K{i:04d}" for i in rng.sample(range(1, n_rows * 3), n_rows)]
@@ -213,3 +216,4 @@ class ScenarioGenerator:
                 new_cols_a=extra_a, new_cols_b=extra_b,
                 description="Schema drift: new columns in A and B.",
             )

     if match_keys:
         # Choose from overlap pool to control referential integrity
         available = list(match_keys)
+        if len(available) >= n_rows:
+            keys = rng.sample(available, n_rows)
+        else:
+            keys = [rng.choice(available) for _ in range(n_rows)]
     else:
         keys = [f"K{i:04d}" for i in rng.sample(range(1, n_rows * 3), n_rows)]
                 new_cols_a=extra_a, new_cols_b=extra_b,
                 description="Schema drift: new columns in A and B.",
             )

scripts/env_test.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from medusa_env import MedusaActionType, MedusaAction, MedusaEnv
+env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
+obs = env.reset(seed=0)  # seed 0 = clean scenario
+print(obs.message)
+for action_type in [
+    MedusaActionType.SYNC_CHECK,
+    MedusaActionType.EVOLVE_SCHEMA,
+    MedusaActionType.PREP_KEYS_A,
+    MedusaActionType.PREP_KEYS_B,
+    MedusaActionType.DEDUPLICATE_B,
+    MedusaActionType.EXECUTE_JOIN_LEFT,
+    MedusaActionType.APPLY_SCD_2,
+    MedusaActionType.COMMIT,
+]:
+    obs = env.step(MedusaAction(action=action_type))
+    print(f"{action_type.value:25s} reward={obs.reward:+.1f}  done={obs.done}")
+print(f"\nGrader: {env.state.grader_report}")

scripts/hf_test.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import os
+import requests
+HF_TOKEN = os.environ["HF_TOKEN"]
+BASE_URL="https://rampluto-medusa-env.hf.space"
+session = requests.Session()
+session.headers.update({
+    "Authorization": f"Bearer {HF_TOKEN}",
+    "Content-Type": "application/json",
+})
+session.verify = False
+r = session.post(f"{BASE_URL}/reset", timeout=30)
+print("reset:", r.status_code, r.text)

server/app.py CHANGED Viewed

@@ -15,8 +15,7 @@ from __future__ import annotations
 #   2. Standalone installed (uv run server): medusa_env.* package
 #   3. Direct execution inside env dir: bare module names
 from openenv.core.env_server.http_server import create_app
-from medusa_env.server import MedusaEnv
-from medusa_env.models import MedusaAction, MedusaObservation
 app = create_app(
     MedusaEnv,

 #   2. Standalone installed (uv run server): medusa_env.* package
 #   3. Direct execution inside env dir: bare module names
 from openenv.core.env_server.http_server import create_app
+from medusa_env import MedusaEnv, MedusaAction, MedusaObservation
 app = create_app(
     MedusaEnv,

server/medusa_env.py CHANGED Viewed

@@ -22,18 +22,34 @@ import pandas as pd
 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import EnvironmentMetadata
-from medusa_env.grader import Grader
-from medusa_env.models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
-from medusa_env.operators import (
-    apply_scd,
-    deduplicate,
-    evolve_schema,
-    execute_join,
-    prep_keys,
-    sync_check,
-)
-from medusa_env.rewards import RewardEngine
-from medusa_env.scenarios import Scenario, ScenarioGenerator
 # ---------------------------------------------------------------------------
@@ -487,13 +503,21 @@ class MedusaEnv(Environment[MedusaAction, MedusaObservation, MedusaState]):
             "timestamp": time.time(),
         })
         features = _build_features(self._state)
         obs = MedusaObservation(
             message=(
                 f"COMMIT: episode finalized. "
                 f"{'Grader: PASS ✓' if grader_result.passed else 'Grader: FAIL ✗'} "
                 f"Bonus: {grader_result.bonus_reward:+.1f} | "
-                f"Total reward: {self._state.cumulative_reward:.1f}"
             ),
             features=features,
             metrics={
@@ -502,13 +526,15 @@ class MedusaEnv(Environment[MedusaAction, MedusaObservation, MedusaState]):
                 "silver_rows": self._state.silver_row_count,
                 "quarantine_rows": self._state.quarantine_row_count,
                 "governance_log_entries": len(self._tables.governance_log),
             },
             metadata={
                 "run_id": self._state.run_id,
                 "steps": self._state.step_idx,
                 "cumulative_reward": self._state.cumulative_reward,
             },
             reward=reward,
             done=True,
         )
-        return self._apply_transform(obs)

 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import EnvironmentMetadata
+try:
+    from .grader import Grader
+    from .models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
+    from .operators import (
+        apply_scd,
+        deduplicate,
+        evolve_schema,
+        execute_join,
+        prep_keys,
+        sync_check,
+    )
+    from .rewards import RewardEngine
+    from .scenarios import Scenario, ScenarioGenerator
+    from .tasks import TASKS, score_episode
+except ImportError:
+    from grader import Grader
+    from models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
+    from operators import (
+        apply_scd,
+        deduplicate,
+        evolve_schema,
+        execute_join,
+        prep_keys,
+        sync_check,
+    )
+    from rewards import RewardEngine
+    from scenarios import Scenario, ScenarioGenerator
+    from tasks import TASKS, score_episode
 # ---------------------------------------------------------------------------
             "timestamp": time.time(),
         })
+        # Map the current episode seed to the task definitions to get the explicit task_id
+        task_id = next((tid for tid, t in TASKS.items() if t.seed == self._state.seed), "clean_pipeline")
+        # Calculate the final [0, 1] evaluation score for this episode
+        final_result = score_episode(task_id, self._state, self._tables)
+        final_score = final_result.score
         features = _build_features(self._state)
         obs = MedusaObservation(
             message=(
                 f"COMMIT: episode finalized. "
                 f"{'Grader: PASS ✓' if grader_result.passed else 'Grader: FAIL ✗'} "
                 f"Bonus: {grader_result.bonus_reward:+.1f} | "
+                f"Total reward: {self._state.cumulative_reward:.1f} | "
+                f"Final Score: {final_score:.3f}"
             ),
             features=features,
             metrics={
                 "silver_rows": self._state.silver_row_count,
                 "quarantine_rows": self._state.quarantine_row_count,
                 "governance_log_entries": len(self._tables.governance_log),
+                "score": final_score,
             },
             metadata={
                 "run_id": self._state.run_id,
                 "steps": self._state.step_idx,
                 "cumulative_reward": self._state.cumulative_reward,
+                "score": final_score,
             },
             reward=reward,
             done=True,
         )
+        return self._apply_transform(obs)

validate.sh ADDED Viewed

	@@ -0,0 +1,185 @@

+#!/usr/bin/env bash
+#
+# validate-submission.sh — OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+#
+# Prerequisites:
+#   - Docker:       https://docs.docker.com/get-docker/
+#   - openenv-core: pip install openenv-core
+#   - curl (usually pre-installed)
+#
+# Run:
+#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+#
+#   Or download and run locally:
+#     chmod +x validate-submission.sh
+#     ./validate-submission.sh <ping_url> [repo_dir]
+#
+# Arguments:
+#   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+#   repo_dir   Path to your repo (default: current directory)
+#
+# Examples:
+#   ./validate-submission.sh https://my-team.hf.space
+#   ./validate-submission.sh https://my-team.hf.space ./my-repo
+#
+set -uo pipefail
+DOCKER_BUILD_TIMEOUT=600
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' BOLD='' NC=''
+fi
+# run_with_timeout SECS CMD [ARGS...]
+# Run CMD with a time limit of SECS seconds. Uses `timeout` or `gtimeout`
+# when one is on PATH; otherwise runs CMD in the background alongside a
+# watcher that kills it after SECS seconds. Returns CMD's exit status.
+run_with_timeout() {
+  local secs="$1"; shift
+  if command -v timeout &>/dev/null; then
+    timeout "$secs" "$@"
+  elif command -v gtimeout &>/dev/null; then
+    gtimeout "$secs" "$@"
+  else
+    "$@" &
+    local pid=$!
+    ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+    local watcher=$!
+    wait "$pid" 2>/dev/null
+    local rc=$?
+    # The command has finished (or was killed), so the watcher is no longer needed.
+    kill "$watcher" 2>/dev/null
+    wait "$watcher" 2>/dev/null
+    return $rc
+  fi
+}
+# portable_mktemp [PREFIX]
+# Create a temp file named PREFIX-XXXXXX (default prefix "validate") under
+# $TMPDIR, or /tmp when TMPDIR is unset, and print its path. Falls back to a
+# plain `mktemp` if the templated call fails.
+portable_mktemp() {
+  local prefix="${1:-validate}"
+  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+}
+# Files appended to CLEANUP_FILES are removed when the script exits.
+CLEANUP_FILES=()
+# The ${arr[@]+"${arr[@]}"} form expands to nothing when the array is empty,
+# so `set -u` does not trip on an empty list.
+cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+trap cleanup EXIT
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+PING_URL="${PING_URL%/}"
+export PING_URL
+PASS=0
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+# stop_at LABEL: print a "Validation stopped at LABEL" banner and exit with status 1.
+stop_at() {
+  printf "\n"
+  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+  exit 1
+}
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+# When curl fails it has already written its own status (000 if no response
+# arrived) through -w. Appending a second "000" inside the substitution gave
+# "000000", which matched neither branch below. Set the fallback outside the
+# substitution so any curl failure yields exactly "000".
+# stderr goes to /dev/null so it no longer shares a file with the -o body.
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  # The hint text is a %b argument, not a printf format, so a single % is correct.
+  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+if ! command -v openenv &>/dev/null; then
+  fail "openenv command not found"
+  hint "Install it: pip install openenv-core"
+  stop_at "Step 3"
+fi
+VALIDATE_OK=false
+VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
+if [ "$VALIDATE_OK" = true ]; then
+  pass "openenv validate passed"
+  [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
+else
+  fail "openenv validate failed"
+  printf "%s\n" "$VALIDATE_OUTPUT"
+  stop_at "Step 3"
+fi
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+exit 0