rampluto commited on
Commit
fe7e1aa
·
verified ·
1 Parent(s): fd09b74

Upload folder using huggingface_hub

Browse files
Files changed (12) hide show
  1. README.md +16 -7
  2. __init__.py +2 -0
  3. client.py +1 -1
  4. inference.py +266 -0
  5. openenv.yaml +64 -0
  6. run_episode.py +88 -0
  7. scenarios.py +5 -1
  8. scripts/env_test.py +20 -0
  9. scripts/hf_test.py +17 -0
  10. server/app.py +1 -2
  11. server/medusa_env.py +40 -14
  12. validate.sh +185 -0
README.md CHANGED
@@ -132,7 +132,7 @@ Random seeds produce blended variants.
132
 
133
  ```bash
134
  # Clone / navigate to repo
135
- cd /path/to/OpenEnv
136
 
137
  # Create venv and install all deps (including pandas, numpy)
138
  uv sync
@@ -148,23 +148,25 @@ source .venv/bin/activate
148
  ### Start the FastAPI server
149
 
150
  ```bash
151
- uvicorn envs.medusa_env.server.app:app --reload --host 0.0.0.0 --port 8000
 
 
152
  ```
153
 
154
  API docs available at `http://localhost:8000/docs`.
155
-
156
  ### Run tests
157
 
158
  ```bash
159
  python -m pytest tests/envs/test_medusa_environment.py -v
160
- # 39 passed in ~4s
161
  ```
162
 
163
  ### Run a manual episode (Python)
164
 
165
  ```python
166
- from envs.medusa_env import MedusaEnv, MedusaAction
167
- from envs.medusa_env.models import MedusaActionType
168
 
169
  env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
170
  obs = env.reset(seed=0) # seed 0 = clean scenario
@@ -186,6 +188,13 @@ for action_type in [
186
  print(f"\nGrader: {env.state.grader_report}")
187
  ```
188
 
 
 
 
 
 
 
 
189
  ---
190
 
191
  ## Architecture
@@ -193,7 +202,6 @@ print(f"\nGrader: {env.state.grader_report}")
193
  ```
194
  envs/medusa_env/
195
  ├── __init__.py # Package exports
196
- ├── medusa_env.py # MedusaEnv — reset / step / commit loop
197
  ├── models.py # MedusaAction, MedusaObservation, MedusaState (Pydantic)
198
  ├── scenarios.py # ScenarioGenerator — procedural Bronze A/B DataFrames
199
  ├── operators.py # Stateless ETL functions (sync_check, prep_keys, execute_join, apply_scd …)
@@ -202,6 +210,7 @@ envs/medusa_env/
202
  ├── openenv.yaml # OpenEnv environment manifest
203
  └── server/
204
  └── app.py # FastAPI app via create_app()
 
205
 
206
  tests/envs/
207
  └── test_medusa_environment.py # 39 tests across 6 test classes
 
132
 
133
  ```bash
134
  # Clone / navigate to repo
135
+ cd Medusa
136
 
137
  # Create venv and install all deps (including pandas, numpy)
138
  uv sync
 
148
  ### Start the FastAPI server
149
 
150
  ```bash
151
+ openenv validate
152
+ openenv build --tag openenv-medusa
153
+ docker run -p 8000:8000 openenv-medusa:latest
154
  ```
155
 
156
  API docs available at `http://localhost:8000/docs`.
157
+ Playground available at `http://localhost:8000/web`
158
  ### Run tests
159
 
160
  ```bash
161
  python -m pytest tests/envs/test_medusa_environment.py -v
162
+ # 53 passed in ~4s
163
  ```
164
 
165
  ### Run a manual episode (Python)
166
 
167
  ```python
168
+ from medusa_env.server import MedusaEnv
169
+ from medusa_env.models import MedusaActionType, MedusaAction
170
 
171
  env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
172
  obs = env.reset(seed=0) # seed 0 = clean scenario
 
188
  print(f"\nGrader: {env.state.grader_report}")
189
  ```
190
 
191
+ ### Steps to push to Hugging Face
192
+ ```bash
193
+ openenv push --repo-id <hf_username>/<hf_space>
194
+ ```
195
+
196
+ Hugging Face BASE_URL="https://<hf_username>-<hf_space>.hf.space"
197
+
198
  ---
199
 
200
  ## Architecture
 
202
  ```
203
  envs/medusa_env/
204
  ├── __init__.py # Package exports
 
205
  ├── models.py # MedusaAction, MedusaObservation, MedusaState (Pydantic)
206
  ├── scenarios.py # ScenarioGenerator — procedural Bronze A/B DataFrames
207
  ├── operators.py # Stateless ETL functions (sync_check, prep_keys, execute_join, apply_scd …)
 
210
  ├── openenv.yaml # OpenEnv environment manifest
211
  └── server/
212
  └── app.py # FastAPI app via create_app()
213
+ ├── medusa_env.py # MedusaEnv — reset / step / commit loop
214
 
215
  tests/envs/
216
  └── test_medusa_environment.py # 39 tests across 6 test classes
__init__.py CHANGED
@@ -15,9 +15,11 @@ from .models import MedusaAction, MedusaActionType, MedusaObservation, MedusaSta
15
  from .rewards import RewardEngine
16
  from .scenarios import Scenario, ScenarioGenerator
17
  from .tasks import TASKS, Task, TaskResult, score_episode
 
18
 
19
  __all__ = [
20
  "medusa_env",
 
21
  "MedusaAction",
22
  "MedusaActionType",
23
  "MedusaObservation",
 
15
  from .rewards import RewardEngine
16
  from .scenarios import Scenario, ScenarioGenerator
17
  from .tasks import TASKS, Task, TaskResult, score_episode
18
+ from server.medusa_env import MedusaEnv
19
 
20
  __all__ = [
21
  "medusa_env",
22
+ "MedusaEnv",
23
  "MedusaAction",
24
  "MedusaActionType",
25
  "MedusaObservation",
client.py CHANGED
@@ -28,7 +28,7 @@ try:
28
  from openenv.core.client_types import StepResult
29
  from openenv.core.env_client import EnvClient
30
 
31
- from .models import MedusaAction, MedusaObservation, MedusaState
32
  except ImportError:
33
  from models import MedusaAction, MedusaObservation, MedusaState
34
 
 
28
  from openenv.core.client_types import StepResult
29
  from openenv.core.env_client import EnvClient
30
 
31
+ from medusa_env.models import MedusaAction, MedusaObservation, MedusaState
32
  except ImportError:
33
  from models import MedusaAction, MedusaObservation, MedusaState
34
 
inference.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MEDUSA inference script — OpenEnv Hackathon submission.
2
+
3
+ Runs an LLM agent (via OpenAI-compatible API) against all three MEDUSA tasks
4
+ and reports per-task scores (0.0–1.0).
5
+
6
+ Required environment variables:
7
+ API_BASE_URL The API endpoint for the LLM (OpenAI-compatible). Defaults to https://router.huggingface.co/v1
8
+ MODEL_NAME The model identifier to use for inference.
9
+ HF_TOKEN Your Hugging Face / API key (also accepts API_KEY).
10
+
11
+ Usage:
12
+ export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
13
+ export HF_TOKEN="hf-..."
14
+ python inference.py
15
+
16
+ Output:
17
+ Prints [START]/[STEP]/[END] log lines for the selected task to stdout.
18
+ Exits with code 1 if required environment variables are missing, else 0.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import os
25
+ import sys
26
+ import textwrap
27
+ import time
28
+ from typing import List, Optional
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Validate required environment variables before anything else
32
+ # ---------------------------------------------------------------------------
33
+
34
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
35
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or "mock-key"
36
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
37
+ TASK_NAME = os.getenv("TASK_NAME", "clean_pipeline")
38
+ BENCHMARK = os.getenv("BENCHMARK", "medusa_env")
39
+
40
+ _missing = [k for k, v in {
41
+ "API_BASE_URL": API_BASE_URL,
42
+ "MODEL_NAME": MODEL_NAME,
43
+ "API_KEY (or HF_TOKEN)": API_KEY,
44
+ }.items() if not v]
45
+
46
+ if _missing:
47
+ print(f"ERROR: Missing required environment variables: {', '.join(_missing)}", file=sys.stderr)
48
+ print("Set them before running:", file=sys.stderr)
49
+ for k in _missing:
50
+ print(f" export {k}=<value>", file=sys.stderr)
51
+ sys.exit(1)
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # OpenAI client (uses API_BASE_URL + HF_TOKEN as the key)
55
+ # ---------------------------------------------------------------------------
56
+
57
+ from openai import OpenAI # noqa: E402
58
+
59
+ client = OpenAI(
60
+ base_url=API_BASE_URL,
61
+ api_key=API_KEY,
62
+ )
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # MEDUSA environment imports
66
+ # ---------------------------------------------------------------------------
67
+
68
+ from pathlib import Path
69
+
70
+ # Dynamically add the OpenEnv repo root to sys.path so absolute imports work
71
+ # no matter where this script is executed from.
72
+ repo_root = str(Path(__file__).resolve().parent.parent.parent)
73
+ if repo_root not in sys.path:
74
+ sys.path.insert(0, repo_root)
75
+
76
+ try:
77
+ # In-repo
78
+ from envs.medusa_env import MedusaEnv
79
+ from envs.medusa_env.models import MedusaAction, MedusaActionType
80
+ from envs.medusa_env.tasks import TASKS, TaskResult, score_episode
81
+ except ImportError:
82
+ # Standalone (running from inside envs/medusa_env/ installation)
83
+ from medusa_env import MedusaEnv # type: ignore
84
+ from models import MedusaAction, MedusaActionType # type: ignore
85
+ from tasks import TASKS, TaskResult, score_episode # type: ignore
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # System prompt
89
+ # ---------------------------------------------------------------------------
90
+
91
+ SYSTEM_PROMPT = textwrap.dedent("""
92
+ You are a data integration agent controlling a Bronze→Silver ETL pipeline.
93
+
94
+ You observe a 16-float feature vector describing data quality signals, and
95
+ you must choose one action per step from the list below.
96
+
97
+ ACTIONS (respond with ONLY the action name — nothing else):
98
+ SYNC_CHECK — Verify source freshness before processing
99
+ EVOLVE_SCHEMA — Add new columns from sources into Silver schema
100
+ PREP_KEYS_A — Clean and normalise join keys in Source A (Fact)
101
+ PREP_KEYS_B — Clean and normalise join keys in Source B (Dimension)
102
+ DEDUPLICATE_B — Remove duplicate keys from Source B
103
+ EXECUTE_JOIN_INNER — Inner join A ⋈ B
104
+ EXECUTE_JOIN_LEFT — Left join A ⋈ B (keeps all Fact rows; orphans → quarantine)
105
+ EXECUTE_JOIN_ANTI — Anti-join: extract Fact rows with no Dimension match
106
+ APPLY_SCD_1 — Overwrite Silver records (SCD Type 1)
107
+ APPLY_SCD_2 — Close old records and insert new with timestamps (SCD Type 2)
108
+ COMMIT — Finalise pipeline and trigger audit
109
+
110
+ STRATEGY:
111
+ 1. Always call SYNC_CHECK first to verify freshness.
112
+ 2. If schema drift signals are non-zero (features[9] or [10] > 0), call EVOLVE_SCHEMA.
113
+ 3. If null key ratios (features[4] or [5] > 0), call PREP_KEYS_A and/or PREP_KEYS_B.
114
+ 4. If Dimension uniqueness (features[7]) < 1.0, call DEDUPLICATE_B.
115
+ 5. Prefer EXECUTE_JOIN_LEFT to preserve all Fact rows.
116
+ 6. Prefer APPLY_SCD_2 for tracked history.
117
+ 7. Call COMMIT when pipeline is complete.
118
+
119
+ The feature vector indices:
120
+ [0] time_delta_a_norm [1] time_delta_b_norm
121
+ [2] is_stale_a [3] is_stale_b
122
+ [4] null_ratio_key_a [5] null_ratio_key_b
123
+ [6] uniqueness_a [7] uniqueness_b
124
+ [8] match_rate [9] new_cols_a_norm
125
+ [10] new_cols_b_norm [11] schema_compat
126
+ [12] did_prep_a [13] did_prep_b
127
+ [14] did_dedup_b [15] step_frac
128
+ """).strip()
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # LLM action chooser
132
+ # ---------------------------------------------------------------------------
133
+
134
+ VALID_ACTIONS = {a.value for a in MedusaActionType}
135
+
136
+
137
+ def choose_action(
138
+ features: List[float],
139
+ history: List[dict],
140
+ step: int,
141
+ ) -> str:
142
+ """Ask the LLM to choose the next action given the current observation."""
143
+ feature_str = ", ".join(f"{v:.3f}" for v in features)
144
+ user_msg = (
145
+ f"Step {step}. Feature vector: [{feature_str}]\n"
146
+ "What is the single best next action? Respond with ONLY the action name."
147
+ )
148
+
149
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
150
+ # Include the last 4 steps of history for context (keep prompt short)
151
+ for h in history[-4:]:
152
+ messages.append({"role": "user", "content": h["user"]})
153
+ messages.append({"role": "assistant", "content": h["assistant"]})
154
+ messages.append({"role": "user", "content": user_msg})
155
+
156
+ response = client.chat.completions.create(
157
+ model=MODEL_NAME,
158
+ messages=messages,
159
+ # max_tokens=20,
160
+ max_completion_tokens=256,
161
+ temperature=0.1,
162
+ )
163
+ raw = response.choices[0].message.content.strip().upper().replace(" ", "_")
164
+
165
+ # Fuzzy match: accept if the response contains a valid action name
166
+ for action in VALID_ACTIONS:
167
+ if action in raw:
168
+ return action
169
+
170
+ # Fallback: extract the longest matching token
171
+ for action in sorted(VALID_ACTIONS, key=len, reverse=True):
172
+ if action.replace("_", "") in raw.replace("_", ""):
173
+ return action
174
+
175
+ # Hard fallback: commit to end gracefully
176
+ return MedusaActionType.COMMIT.value
177
+
178
+
179
+ # ---------------------------------------------------------------------------
180
+ # Logging Functions (Hackathon STDOut Format)
181
+ # ---------------------------------------------------------------------------
182
+
183
+ def log_start(task: str, env: str, model: str) -> None:
184
+ print(f"[START] task={task} env={env} model={model}", flush=True)
185
+
186
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
187
+ error_val = error if error else "null"
188
+ done_val = str(done).lower()
189
+ print(
190
+ f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
191
+ flush=True,
192
+ )
193
+
194
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
195
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
196
+ print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
197
+
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # Run one task
201
+ # ---------------------------------------------------------------------------
202
+
203
+ def run_task(task_id: str, max_steps: int = 15) -> None:
204
+ """Run the LLM agent for one MEDUSA task using required hackathon STDOUT format."""
205
+ log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
206
+
207
+ task = TASKS[task_id]
208
+ env = MedusaEnv(n_fact_rows=200, n_dim_rows=150, max_steps=max_steps)
209
+ obs = env.reset(seed=task.seed)
210
+
211
+ history: List[dict] = []
212
+ rewards_list: List[float] = []
213
+ step = 0
214
+ success = False
215
+ score = 0.0
216
+
217
+ try:
218
+ while not obs.done and step < max_steps:
219
+ step += 1
220
+ action_str = choose_action(obs.features, history, step)
221
+
222
+ # Since the environment throws errors on bad actions, we just pass the action string.
223
+ try:
224
+ action_type = MedusaActionType(action_str)
225
+ except ValueError:
226
+ action_type = MedusaActionType.COMMIT # default fallback
227
+
228
+ action = MedusaAction(action=action_type)
229
+
230
+ obs = env.step(action)
231
+ reward = obs.reward or 0.0
232
+ rewards_list.append(reward)
233
+
234
+ log_step(step=step, action=action_str, reward=reward, done=obs.done, error=None)
235
+
236
+ history.append({
237
+ "user": (f"Step {step}. Features: [{', '.join(f'{v:.3f}' for v in obs.features)}]"
238
+ " What action?"),
239
+ "assistant": action_str,
240
+ })
241
+
242
+ if obs.done:
243
+ break
244
+
245
+ # Tally final score via grader
246
+ result = score_episode(task_id, env.state, env._tables)
247
+ score = result.score
248
+ success = result.passed
249
+
250
+ except Exception as e:
251
+ log_step(step=step+1 if step > 0 else 1, action="ERROR", reward=0.0, done=True, error=str(e))
252
+ finally:
253
+ log_end(success=success, steps=step, score=score, rewards=rewards_list)
254
+
255
+
256
+ # ---------------------------------------------------------------------------
257
+ # Main
258
+ # ---------------------------------------------------------------------------
259
+
260
+ def main() -> None:
261
+ # Do not loop task variants anymore; run dynamically via TASK_NAME
262
+ run_task(TASK_NAME)
263
+
264
+ if __name__ == "__main__":
265
+ main()
266
+
openenv.yaml CHANGED
@@ -4,3 +4,67 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
+ tasks:
8
+ - id: clean_pipeline
9
+ name: Clean Pipeline
10
+ difficulty: easy
11
+ seed: 0
12
+ description: >
13
+ Both sources are fresh. Join keys are clean and unique. The agent must
14
+ verify freshness, prepare keys, join, apply SCD, and commit without
15
+ triggering a row explosion.
16
+ success_criteria:
17
+ - COMMIT issued (episode finalized)
18
+ - No Cartesian explosion detected
19
+ - Silver row count <= Source A row count
20
+ - match_rate > 0.80 after join
21
+ scoring_rubric:
22
+ committed: 0.20
23
+ no_explosion: 0.25
24
+ volume_ok: 0.20
25
+ high_match: 0.20
26
+ grader_pass: 0.15
27
+ - id: dirty_integration
28
+ name: Dirty Key Integration
29
+ difficulty: medium
30
+ seed: 1
31
+ description: >
32
+ Source A has NULLs and whitespace in join keys. Source B has duplicate
33
+ keys that can cause row explosion. The agent must PREP_KEYS and
34
+ DEDUPLICATE before joining, and correctly quarantine unresolvable
35
+ orphans.
36
+ success_criteria:
37
+ - PREP_KEYS_A issued before EXECUTE_JOIN
38
+ - PREP_KEYS_B issued before EXECUTE_JOIN
39
+ - DEDUPLICATE_B issued before EXECUTE_JOIN
40
+ - No row explosion
41
+ - Quarantine integrity check passes
42
+ scoring_rubric:
43
+ committed: 0.10
44
+ prepped_before_join: 0.20
45
+ deduped_before_join: 0.20
46
+ no_explosion: 0.25
47
+ integrity_ok: 0.15
48
+ grader_pass: 0.10
49
+ - id: full_medallion
50
+ name: Full Medallion Integration
51
+ difficulty: hard
52
+ seed: 2
53
+ description: >
54
+ Source A is stale (>6h old). Source B has new schema columns not
55
+ registered in Silver. The agent must check freshness, evolve the schema,
56
+ clean keys, deduplicate, execute a left join, apply SCD-2 for tracked
57
+ columns, and pass all grader checks.
58
+ success_criteria:
59
+ - SYNC_CHECK issued before any join
60
+ - EVOLVE_SCHEMA issued before COMMIT
61
+ - SCD-2 applied (not SCD-1) for tracked column
62
+ - Silver schema contains new columns from drift
63
+ - All 4 grader checks pass
64
+ scoring_rubric:
65
+ committed: 0.05
66
+ sync_checked: 0.15
67
+ schema_evolved: 0.15
68
+ used_scd2: 0.20
69
+ schema_ok: 0.20
70
+ grader_pass: 0.25
run_episode.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ # Support both installed package usage (medusa_env) and in-repo local modules
4
+ try:
5
+ from medusa_env import (
6
+ medusa_env,
7
+ MedusaAction,
8
+ MedusaActionType,
9
+ MedusaObservation,
10
+ )
11
+ except ImportError:
12
+ # Fallback to local modules when running from the repo root without installing
13
+ from client import medusa_env
14
+ from models import MedusaAction, MedusaActionType, MedusaObservation
15
+
16
+ MEDUSA_URL = 'https://anubhavkamal-medusa-env.hf.space'
17
+ # MEDUSA_URL = 'http://localhost:8000'
18
+
19
+
20
+ class RandomPolicy:
21
+ """Pure random — baseline for MEDUSA."""
22
+ name = "Random"
23
+
24
+ def select_action(self, obs: MedusaObservation) -> MedusaActionType:
25
+ # Pick randomly from the 11 valid operators
26
+ return random.choice(list(MedusaActionType))
27
+
28
+
29
+ class AlwaysCommitPolicy:
30
+ """Immediately terminates the episode by committing."""
31
+ name = "Always Commit"
32
+
33
+ def select_action(self, obs: MedusaObservation) -> MedusaActionType:
34
+ return MedusaActionType.COMMIT
35
+
36
+
37
+ class CleanPipelinePolicy:
38
+ """Hardcoded sequence to perfectly solve the Easy (Clean Pipeline) task."""
39
+ name = "Clean Pipeline Heuristic"
40
+
41
+ def __init__(self):
42
+ # The correct sequence of operations for the clean pipeline scenario
43
+ self.sequence = [
44
+ MedusaActionType.SYNC_CHECK,
45
+ MedusaActionType.PREP_KEYS_A,
46
+ MedusaActionType.PREP_KEYS_B,
47
+ MedusaActionType.EXECUTE_JOIN_LEFT,
48
+ MedusaActionType.APPLY_SCD_2,
49
+ MedusaActionType.COMMIT
50
+ ]
51
+ self.step = 0
52
+
53
+ def select_action(self, obs: MedusaObservation) -> MedusaActionType:
54
+ if self.step < len(self.sequence):
55
+ action = self.sequence[self.step]
56
+ self.step += 1
57
+ return action
58
+ return MedusaActionType.COMMIT
59
+
60
+
61
+ print("Policies defined: Random, Always Commit, Clean Pipeline Heuristic")
62
+
63
+
64
+ def run_episode(env, policy, seed=0, verbose=False):
65
+ """Play one episode. Returns the final reward (-1.0 to 1.0)."""
66
+ result = env.reset(seed=seed)
67
+ step = 0
68
+
69
+ while not result.done:
70
+ action_type = policy.select_action(result.observation)
71
+ if verbose:
72
+ print(f' Step {step}: {action_type.value}')
73
+
74
+ result = env.step(MedusaAction(action=action_type))
75
+ step += 1
76
+
77
+ if verbose:
78
+ print(f' Result: Done (reward={result.reward})')
79
+ print(f' Terminal Message: {result.observation.message}')
80
+ if result.observation.metrics:
81
+ print(f' Final Grade: {result.observation.metrics.get("grader_report")}')
82
+ return result.reward
83
+
84
+
85
+ # Demo: one verbose episode with CleanPipelinePolicy
86
+ with medusa_env(base_url=MEDUSA_URL).sync() as env:
87
+ print('\nTesting Clean Pipeline Policy — single episode (seed=0):')
88
+ run_episode(env, CleanPipelinePolicy(), seed=0, verbose=True)
scenarios.py CHANGED
@@ -85,7 +85,10 @@ def _make_dim(
85
  if match_keys:
86
  # Choose from overlap pool to control referential integrity
87
  available = list(match_keys)
88
- keys = [rng.choice(available) for _ in range(n_rows)]
 
 
 
89
  else:
90
  keys = [f"K{i:04d}" for i in rng.sample(range(1, n_rows * 3), n_rows)]
91
 
@@ -213,3 +216,4 @@ class ScenarioGenerator:
213
  new_cols_a=extra_a, new_cols_b=extra_b,
214
  description="Schema drift: new columns in A and B.",
215
  )
 
 
85
  if match_keys:
86
  # Choose from overlap pool to control referential integrity
87
  available = list(match_keys)
88
+ if len(available) >= n_rows:
89
+ keys = rng.sample(available, n_rows)
90
+ else:
91
+ keys = [rng.choice(available) for _ in range(n_rows)]
92
  else:
93
  keys = [f"K{i:04d}" for i in rng.sample(range(1, n_rows * 3), n_rows)]
94
 
 
216
  new_cols_a=extra_a, new_cols_b=extra_b,
217
  description="Schema drift: new columns in A and B.",
218
  )
219
+
scripts/env_test.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from medusa_env import MedusaActionType, MedusaAction, MedusaEnv
2
+
3
+ env = MedusaEnv(n_fact_rows=200, n_dim_rows=150)
4
+ obs = env.reset(seed=0) # seed 0 = clean scenario
5
+ print(obs.message)
6
+
7
+ for action_type in [
8
+ MedusaActionType.SYNC_CHECK,
9
+ MedusaActionType.EVOLVE_SCHEMA,
10
+ MedusaActionType.PREP_KEYS_A,
11
+ MedusaActionType.PREP_KEYS_B,
12
+ MedusaActionType.DEDUPLICATE_B,
13
+ MedusaActionType.EXECUTE_JOIN_LEFT,
14
+ MedusaActionType.APPLY_SCD_2,
15
+ MedusaActionType.COMMIT,
16
+ ]:
17
+ obs = env.step(MedusaAction(action=action_type))
18
+ print(f"{action_type.value:25s} reward={obs.reward:+.1f} done={obs.done}")
19
+
20
+ print(f"\nGrader: {env.state.grader_report}")
scripts/hf_test.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+
4
+ HF_TOKEN = os.environ["HF_TOKEN"]
5
+ BASE_URL="https://rampluto-medusa-env.hf.space"
6
+
7
+ session = requests.Session()
8
+ session.headers.update({
9
+ "Authorization": f"Bearer {HF_TOKEN}",
10
+ "Content-Type": "application/json",
11
+ })
12
+
13
+ session.verify = False
14
+
15
+
16
+ r = session.post(f"{BASE_URL}/reset", timeout=30)
17
+ print("reset:", r.status_code, r.text)
server/app.py CHANGED
@@ -15,8 +15,7 @@ from __future__ import annotations
15
  # 2. Standalone installed (uv run server): medusa_env.* package
16
  # 3. Direct execution inside env dir: bare module names
17
  from openenv.core.env_server.http_server import create_app
18
- from medusa_env.server import MedusaEnv
19
- from medusa_env.models import MedusaAction, MedusaObservation
20
 
21
  app = create_app(
22
  MedusaEnv,
 
15
  # 2. Standalone installed (uv run server): medusa_env.* package
16
  # 3. Direct execution inside env dir: bare module names
17
  from openenv.core.env_server.http_server import create_app
18
+ from medusa_env import MedusaEnv, MedusaAction, MedusaObservation
 
19
 
20
  app = create_app(
21
  MedusaEnv,
server/medusa_env.py CHANGED
@@ -22,18 +22,34 @@ import pandas as pd
22
  from openenv.core.env_server.interfaces import Environment
23
  from openenv.core.env_server.types import EnvironmentMetadata
24
 
25
- from medusa_env.grader import Grader
26
- from medusa_env.models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
27
- from medusa_env.operators import (
28
- apply_scd,
29
- deduplicate,
30
- evolve_schema,
31
- execute_join,
32
- prep_keys,
33
- sync_check,
34
- )
35
- from medusa_env.rewards import RewardEngine
36
- from medusa_env.scenarios import Scenario, ScenarioGenerator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
 
39
  # ---------------------------------------------------------------------------
@@ -487,13 +503,21 @@ class MedusaEnv(Environment[MedusaAction, MedusaObservation, MedusaState]):
487
  "timestamp": time.time(),
488
  })
489
 
 
 
 
 
 
 
 
490
  features = _build_features(self._state)
491
  obs = MedusaObservation(
492
  message=(
493
  f"COMMIT: episode finalized. "
494
  f"{'Grader: PASS ✓' if grader_result.passed else 'Grader: FAIL ✗'} "
495
  f"Bonus: {grader_result.bonus_reward:+.1f} | "
496
- f"Total reward: {self._state.cumulative_reward:.1f}"
 
497
  ),
498
  features=features,
499
  metrics={
@@ -502,13 +526,15 @@ class MedusaEnv(Environment[MedusaAction, MedusaObservation, MedusaState]):
502
  "silver_rows": self._state.silver_row_count,
503
  "quarantine_rows": self._state.quarantine_row_count,
504
  "governance_log_entries": len(self._tables.governance_log),
 
505
  },
506
  metadata={
507
  "run_id": self._state.run_id,
508
  "steps": self._state.step_idx,
509
  "cumulative_reward": self._state.cumulative_reward,
 
510
  },
511
  reward=reward,
512
  done=True,
513
  )
514
- return self._apply_transform(obs)
 
22
  from openenv.core.env_server.interfaces import Environment
23
  from openenv.core.env_server.types import EnvironmentMetadata
24
 
25
+ try:
26
+ from .grader import Grader
27
+ from .models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
28
+ from .operators import (
29
+ apply_scd,
30
+ deduplicate,
31
+ evolve_schema,
32
+ execute_join,
33
+ prep_keys,
34
+ sync_check,
35
+ )
36
+ from .rewards import RewardEngine
37
+ from .scenarios import Scenario, ScenarioGenerator
38
+ from .tasks import TASKS, score_episode
39
+ except ImportError:
40
+ from grader import Grader
41
+ from models import MedusaAction, MedusaActionType, MedusaObservation, MedusaState
42
+ from operators import (
43
+ apply_scd,
44
+ deduplicate,
45
+ evolve_schema,
46
+ execute_join,
47
+ prep_keys,
48
+ sync_check,
49
+ )
50
+ from rewards import RewardEngine
51
+ from scenarios import Scenario, ScenarioGenerator
52
+ from tasks import TASKS, score_episode
53
 
54
 
55
  # ---------------------------------------------------------------------------
 
503
  "timestamp": time.time(),
504
  })
505
 
506
+ # Map the current episode seed to the task definitions to get the explicit task_id
507
+ task_id = next((tid for tid, t in TASKS.items() if t.seed == self._state.seed), "clean_pipeline")
508
+
509
+ # Calculate the final [0, 1] evaluation score for this episode
510
+ final_result = score_episode(task_id, self._state, self._tables)
511
+ final_score = final_result.score
512
+
513
  features = _build_features(self._state)
514
  obs = MedusaObservation(
515
  message=(
516
  f"COMMIT: episode finalized. "
517
  f"{'Grader: PASS ✓' if grader_result.passed else 'Grader: FAIL ✗'} "
518
  f"Bonus: {grader_result.bonus_reward:+.1f} | "
519
+ f"Total reward: {self._state.cumulative_reward:.1f} | "
520
+ f"Final Score: {final_score:.3f}"
521
  ),
522
  features=features,
523
  metrics={
 
526
  "silver_rows": self._state.silver_row_count,
527
  "quarantine_rows": self._state.quarantine_row_count,
528
  "governance_log_entries": len(self._tables.governance_log),
529
+ "score": final_score,
530
  },
531
  metadata={
532
  "run_id": self._state.run_id,
533
  "steps": self._state.step_idx,
534
  "cumulative_reward": self._state.cumulative_reward,
535
+ "score": final_score,
536
  },
537
  reward=reward,
538
  done=True,
539
  )
540
+ return self._apply_transform(obs)
validate.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate.sh — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
+ set -uo pipefail
29
+
30
+ DOCKER_BUILD_TIMEOUT=600
31
+ if [ -t 1 ]; then
32
+ RED='\033[0;31m'
33
+ GREEN='\033[0;32m'
34
+ YELLOW='\033[1;33m'
35
+ BOLD='\033[1m'
36
+ NC='\033[0m'
37
+ else
38
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
39
+ fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
+ portable_mktemp() {
61
+ local prefix="${1:-validate}"
62
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
+ }
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
107
+
108
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
109
+ CLEANUP_FILES+=("$CURL_OUTPUT")
110
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
111
+ -H "Content-Type: application/json" -d '{}' \
112
+ "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
113
+
114
+ if [ "$HTTP_CODE" = "200" ]; then
115
+ pass "HF Space is live and responds to /reset"
116
+ elif [ "$HTTP_CODE" = "000" ]; then
117
+ fail "HF Space not reachable (connection failed or timed out)"
118
+ hint "Check your network connection and that the Space is running."
119
+ hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
120
+ stop_at "Step 1"
121
+ else
122
+ fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
123
+ hint "Make sure your Space is running and the URL is correct."
124
+ hint "Try opening $PING_URL in your browser first."
125
+ stop_at "Step 1"
126
+ fi
127
+
128
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
129
+
130
+ if ! command -v docker &>/dev/null; then
131
+ fail "docker command not found"
132
+ hint "Install Docker: https://docs.docker.com/get-docker/"
133
+ stop_at "Step 2"
134
+ fi
135
+
136
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
137
+ DOCKER_CONTEXT="$REPO_DIR"
138
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
+ DOCKER_CONTEXT="$REPO_DIR/server"
140
+ else
141
+ fail "No Dockerfile found in repo root or server/ directory"
142
+ stop_at "Step 2"
143
+ fi
144
+
145
+ log " Found Dockerfile in $DOCKER_CONTEXT"
146
+
147
+ BUILD_OK=false
148
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
+
150
+ if [ "$BUILD_OK" = true ]; then
151
+ pass "Docker build succeeded"
152
+ else
153
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
154
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
155
+ stop_at "Step 2"
156
+ fi
157
+
158
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
+
160
+ if ! command -v openenv &>/dev/null; then
161
+ fail "openenv command not found"
162
+ hint "Install it: pip install openenv-core"
163
+ stop_at "Step 3"
164
+ fi
165
+
166
+ VALIDATE_OK=false
167
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
+
169
+ if [ "$VALIDATE_OK" = true ]; then
170
+ pass "openenv validate passed"
171
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
172
+ else
173
+ fail "openenv validate failed"
174
+ printf "%s\n" "$VALIDATE_OUTPUT"
175
+ stop_at "Step 3"
176
+ fi
177
+
178
+ printf "\n"
179
+ printf "${BOLD}========================================${NC}\n"
180
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
181
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
+ printf "${BOLD}========================================${NC}\n"
183
+ printf "\n"
184
+
185
+ exit 0