KnightBlade committed on
Commit
29473f6
·
1 Parent(s): 8d27c3e

feat: Priority 2-4 implementations

Browse files
.github/workflows/ci.yml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Continuous integration / deployment pipeline for the Data Wrangler project.
# Runs the test suite on every push and PR against main, then mirrors main
# to the Hugging Face Space once tests pass.
name: Data Wrangler CI/CD

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r server/requirements.txt
          pip install pytest openenv

      - name: Run Tests
        run: |
          pytest tests/ -v

  deploy_hf_space:
    # Deploy only after the test job succeeds, and only for pushes to main
    # (PR runs stop at the test job).
    needs: test
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
        with:
          # Full history is required so the force-push below can mirror
          # the complete branch to the Space remote.
          fetch-depth: 0

      - name: Push to Hugging Face
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "github-actions[bot]"
          git remote add hf https://user:$HF_TOKEN@huggingface.co/spaces/KnightBlade/data_wrangler
          git push -f hf main
inference.py CHANGED
@@ -58,29 +58,38 @@ Select Action: Which action type and parameters will execute this fix?
58
  }
59
  """
60
 
61
async def get_model_message(client, step, obs_dict, last_reward, history):
    """Ask the chat model for the next action, parsed as a JSON dict.

    Builds a single-turn prompt from the current observation, the last
    reward, and the action history, then extracts the first {...} span
    from the model reply.  Any API or parsing failure degrades to the
    fallback action {"action_type": "submit"}.
    """
    obs_text = str(obs_dict)
    prompt = f"Step {step}.\nObservation: {obs_text}\nLast Reward: {last_reward}\nHistory: {history}\nChoose your next action (JSON matching schema)."
    try:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        content = response.choices[0].message.content
        import json
        import re
        # Basic parsing of the JSON structure that follows the thinking tags
        match = re.search(r'(\{.*\})', content, re.DOTALL)
        if match:
            return json.loads(match.group(1))
        # Fallback if unparseable
        return {"action_type": "submit"}
    except Exception as e:
        # NOTE(review): swallows every error silently — any API failure or
        # malformed JSON becomes a bare "submit" with no retry or logging.
        return {"action_type": "submit"}
 
 
 
 
 
 
 
 
 
84
 
85
  def log_start(task, env, model):
86
  print(f"[START] task={task} env={env} model={model}")
 
58
  }
59
  """
60
 
61
async def get_model_message(client, step, obs_dict, last_reward, history, max_retries=3):
    """Query the chat model for the next action and parse it as a JSON dict.

    Builds a single-turn prompt from the current observation, last reward,
    and action history.  If the previous action's feedback reports an error,
    the prompt is augmented so the model can reflect on and correct the
    mistake (Priority 3).  The call/parse cycle is retried up to
    ``max_retries`` times, appending a warning to the prompt after each
    failed attempt.

    Args:
        client: OpenAI-compatible async client exposing chat.completions.create.
        step: Current step number, embedded in the prompt.
        obs_dict: Observation mapping; ``last_action_feedback`` is inspected
            for error text.
        last_reward: Reward from the previous step, embedded in the prompt.
        history: Action history, embedded in the prompt.
        max_retries: Maximum number of model calls before giving up.

    Returns:
        dict: Parsed action, or {"action_type": "submit"} if every attempt fails.
    """
    # Hoisted out of the retry loop — the original re-imported per attempt.
    import json
    import re

    obs_text = str(obs_dict)
    prompt = f"Step {step}.\nObservation: {obs_text}\nLast Reward: {last_reward}\nHistory: {history}\nChoose your next action (JSON matching schema)."

    # Priority 3: Error Reflection. Pass previous feedback directly to the
    # LLM if there was an error.  (Single lookup instead of two.)
    feedback = obs_dict.get("last_action_feedback", "")
    if "Error" in feedback or "Exception" in feedback:
        prompt += f"\nCRITICAL: Your last action failed with this error: {feedback}. Review your <thinking> block to correct your mistake before trying a new action."

    for attempt in range(max_retries):
        # Keep the try body limited to the operations that can actually raise;
        # the original wrapped the regex search and branching too.
        try:
            response = await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0
            )
            content = response.choices[0].message.content
        except Exception as e:
            prompt += f"\nWarning: Exception on attempt {attempt+1}: {str(e)}. Provide valid JSON."
            continue

        match = re.search(r'(\{.*\})', content, re.DOTALL)
        if match is None:
            prompt += f"\nWarning: Failed to extract JSON on attempt {attempt+1}. Provide ONLY valid JSON inside curly braces."
            continue

        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as e:
            # Mirrors the original's broad-except path for malformed JSON.
            prompt += f"\nWarning: Exception on attempt {attempt+1}: {str(e)}. Provide valid JSON."

    # Fallback only if absolutely all retries fail.
    return {"action_type": "submit"}
93
 
94
def log_start(task, env, model):
    """Announce the beginning of a run on stdout."""
    banner = f"[START] task={task} env={env} model={model}"
    print(banner)
server/data_wrangler_environment.py CHANGED
@@ -157,21 +157,36 @@ class DataWranglerEnvironment(Environment):
157
 
158
def _grade(self) -> float:
    """Score the working frame against the target frame (pre-change version).

    Awards 0.5 for an exact, order-sensitive column-name match, plus up to
    0.5 scaled by the fraction of target columns whose values all compare
    equal element-wise.
    """
    score = 0.0
    if list(self.df.columns) == list(self.target_df.columns):
        score += 0.5
    # Match types and values
    value_matches = 0
    for col in self.df.columns:
        try:
            # simple match check
            match = (self.df[col] == self.target_df[col]).all()
            if match:
                value_matches += 1
        # NOTE(review): bare except — also swallows KeyboardInterrupt and
        # SystemExit, not just comparison failures.
        except:
            pass
    score += 0.5 * (value_matches / max(len(self.target_df.columns), 1))
    return score
175
 
176
  @property
177
  def state(self) -> State:
 
157
 
158
  def _grade(self) -> float:
159
  score = 0.0
160
+
161
+ # Priority 2: Partial credit per correct column (name + dtype + values)
162
+ correct_columns = 0
163
+ target_cols = set(self.target_df.columns)
164
+ current_cols = set(self.df.columns)
165
+
166
+ for col in target_cols:
167
+ if col in current_cols:
168
  try:
169
+ # Check dtype match
170
+ if self.df[col].dtype == self.target_df[col].dtype:
171
+ # Check value match
172
+ if (self.df[col].equals(self.target_df[col])):
173
+ correct_columns += 1
174
  except:
175
  pass
176
+
177
+ # Max score from matching columns is 0.8 (leaving 0.2 for efficiency)
178
+ column_score = (correct_columns / max(len(target_cols), 1)) * 0.8
179
+ score += column_score
180
+
181
+ # Priority 2: Step efficiency bonus
182
+ # If solved in few steps, give up to 0.2 bonus
183
+ ideal_steps = len(target_cols) # rough estimate
184
+ if self._state.step_count <= ideal_steps + 2:
185
+ score += 0.2
186
+ elif self._state.step_count <= ideal_steps + 5:
187
+ score += 0.1
188
 
189
+ return min(max(score, 0.0), 1.0)
190
 
191
  @property
192
  def state(self) -> State:
tests/test_env.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from server.data_wrangler_environment import DataWranglerEnvironment
3
+ from models import DataWranglerAction
4
+
5
def test_environment_reset():
    """A freshly reset environment exposes the raw starting frame."""
    environment = DataWranglerEnvironment()
    observation = environment.reset()
    expected_columns = ["User Name", "Unnamed: 0", "Age"]
    assert observation.columns == expected_columns
    assert observation.row_count == 3
    assert not observation.is_done
11
+
12
def test_drop_action_scoring():
    """Dropping a column the target keeps should surface warning feedback."""
    environment = DataWranglerEnvironment()
    environment.reset()
    # Removing "User Name" is a mistake the environment should penalize.
    action = DataWranglerAction(action_type="drop_column", target_column="User Name")
    observation = environment.step(action)
    assert "User Name" not in observation.columns
    feedback = observation.last_action_feedback
    assert "Warning" in feedback or "Error" in feedback
20
+
21
def test_successful_grading():
    """Solving task level 1 end-to-end earns a high reward."""
    import os
    os.environ["TASK_LEVEL"] = "1"
    environment = DataWranglerEnvironment()
    environment.reset()

    # Full fix: drop the index artifact, then snake_case both columns.
    actions = [
        DataWranglerAction(action_type="drop_column", target_column="Unnamed: 0"),
        DataWranglerAction(action_type="rename_column", target_column="User Name", new_name="user_name"),
        DataWranglerAction(action_type="rename_column", target_column="Age", new_name="age"),
    ]
    for action in actions:
        environment.step(action)

    # Submitting ends the episode; partial credit + efficiency bonus
    # should push the reward above 0.8.
    observation = environment.step(DataWranglerAction(action_type="submit"))
    assert observation.is_done
    assert observation.reward > 0.8