Spaces:

iitian
/

open_env

Sleeping

App Files Community

iitian committed 13 days ago

Commit

47ab3b8

1 Parent(s): 7b50b8a

fix: update grader scores to fall strictly within (0, 1)

Browse files

Files changed (3) hide show

inference.py +6 -6
server/environment.py +8 -8
server/tasks.py +3 -3

inference.py CHANGED Viewed

@@ -191,7 +191,7 @@ def run_task(task: dict):
         obs = reset_data.get("observation", {})
         info = obs.get("info", "")
     except Exception as e:
-        log_end(success=False, total_steps=0, score=0.0, rewards=[])
         return
     conversation = [
@@ -209,7 +209,7 @@ def run_task(task: dict):
             action = ask_llm(system_prompt, conversation)
         except Exception as e:
             last_error = f"LLM error: {str(e)}"
-            log_step(step_num, {"error": "LLM failed"}, 0.0, True, error=last_error)
             break
         # Execute the action in the environment
@@ -221,14 +221,14 @@ def run_task(task: dict):
             last_error = obs.get("last_action_error")
         except Exception as e:
             last_error = f"Env error: {str(e)}"
-            log_step(step_num, action, 0.0, True, error=last_error)
             break
         rewards.append(reward)
         log_step(step_num, action, reward, done, error=last_error)
         if done:
-            success = (reward >= 1.0)  # Assume 1.0 is full success
             break
         # Build observation summary for the LLM
@@ -250,8 +250,8 @@ def run_task(task: dict):
         conversation.append({"role": "assistant", "content": json.dumps(action)})
         conversation.append({"role": "user", "content": f"Observation from environment:\n{obs_text}\n\nDecide your next action."})
-    # Calculate final score (normalized to [0, 1])
-    final_score = max(0.0, min(1.0, sum(rewards)))
     log_end(success=success, total_steps=step_num, score=final_score, rewards=rewards)

         obs = reset_data.get("observation", {})
         info = obs.get("info", "")
     except Exception as e:
+        log_end(success=False, total_steps=0, score=0.01, rewards=[])
         return
     conversation = [
             action = ask_llm(system_prompt, conversation)
         except Exception as e:
             last_error = f"LLM error: {str(e)}"
+            log_step(step_num, {"error": "LLM failed"}, 0.01, True, error=last_error)
             break
         # Execute the action in the environment
             last_error = obs.get("last_action_error")
         except Exception as e:
             last_error = f"Env error: {str(e)}"
+            log_step(step_num, action, 0.01, True, error=last_error)
             break
         rewards.append(reward)
         log_step(step_num, action, reward, done, error=last_error)
         if done:
+            success = (reward >= 0.8)  # Assume 0.8+ is full success (max is 0.85)
             break
         # Build observation summary for the LLM
         conversation.append({"role": "assistant", "content": json.dumps(action)})
         conversation.append({"role": "user", "content": f"Observation from environment:\n{obs_text}\n\nDecide your next action."})
+    # Calculate final score (normalized to (0, 1) to satisfy validator)
+    final_score = max(0.01, min(0.99, sum(rewards)))
     log_end(success=success, total_steps=step_num, score=final_score, rewards=rewards)

server/environment.py CHANGED Viewed

@@ -12,7 +12,7 @@ class CloudAuditEnv:
         self.episode_id = str(uuid.uuid4())
         self.step_count = 0
         self.is_completed = False
-        self.score = 0.0
         # Mock Infrastructure
         self.resources = {
@@ -40,13 +40,13 @@ class CloudAuditEnv:
         """Required by openenv-core 0.1.1: takes task_id, returns JUST the observation."""
         self.task_id = task_id
         self._initialize_state()
-        return CloudObservation(info=f"Environment reset. Task: {self.task_id}", reward=0.0, done=False)
     def step(self, action: CloudAction) -> CloudObservation:
         """Required by openenv-core 0.1.1: takes action, returns JUST the observation with reward/done fields."""
         try:
             self.step_count += 1
-            reward = 0.0
             terminated = False
             truncated = self.step_count >= 20  # Limit steps
@@ -86,7 +86,7 @@ class CloudAuditEnv:
                     rules = self.resources["ec2"][0]["security_groups"][0]["rules"]
                     has_rdp = any(r["port"] == 3389 and r["cidr"] == "0.0.0.0/0" for r in rules)
                     if not has_rdp:
-                        reward = 1.0
                         terminated = True
                         obs.info = "Success! Port 3389 removed. Task completed."
                     else:
@@ -112,7 +112,7 @@ class CloudAuditEnv:
                         answers = [a.strip() for a in action.answer.split(",")]
                         expected = ["prod-data-001"]
                         if set(answers) == set(expected):
-                            reward = 1.0
                             terminated = True
                             obs.info = "Correct! Task completed."
                         else:
@@ -121,7 +121,7 @@ class CloudAuditEnv:
                 elif self.task_id == "hard":
                     # Expecting rogue IP from auth-logs
                     if action.answer and action.answer.strip() == "192.168.1.50":
-                        reward = 1.0
                         terminated = True
                         obs.info = "Correct! Rogue IP identified. Task completed."
                     else:
@@ -130,7 +130,7 @@ class CloudAuditEnv:
                 elif self.task_id == "medium":
                     obs.info = "For the medium task, use the 'modify' action to update the EC2 security group, not 'submit'."
-            self.score += reward
             obs.reward = reward
             obs.done = terminated or truncated
             return obs
@@ -139,7 +139,7 @@ class CloudAuditEnv:
             import traceback
             print(f"ERROR in environment.step: {str(e)}", file=sys.stderr)
             traceback.print_exc(file=sys.stderr)
-            return CloudObservation(status=f"Internal Server Error: {str(e)}", reward=0.0, done=True)
     def state(self) -> CloudState:
         return CloudState(

         self.episode_id = str(uuid.uuid4())
         self.step_count = 0
         self.is_completed = False
+        self.score = 0.01
         # Mock Infrastructure
         self.resources = {
         """Required by openenv-core 0.1.1: takes task_id, returns JUST the observation."""
         self.task_id = task_id
         self._initialize_state()
+        return CloudObservation(info=f"Environment reset. Task: {self.task_id}", reward=0.01, done=False)
     def step(self, action: CloudAction) -> CloudObservation:
         """Required by openenv-core 0.1.1: takes action, returns JUST the observation with reward/done fields."""
         try:
             self.step_count += 1
+            reward = 0.005
             terminated = False
             truncated = self.step_count >= 20  # Limit steps
                     rules = self.resources["ec2"][0]["security_groups"][0]["rules"]
                     has_rdp = any(r["port"] == 3389 and r["cidr"] == "0.0.0.0/0" for r in rules)
                     if not has_rdp:
+                        reward = 0.85
                         terminated = True
                         obs.info = "Success! Port 3389 removed. Task completed."
                     else:
                         answers = [a.strip() for a in action.answer.split(",")]
                         expected = ["prod-data-001"]
                         if set(answers) == set(expected):
+                            reward = 0.85
                             terminated = True
                             obs.info = "Correct! Task completed."
                         else:
                 elif self.task_id == "hard":
                     # Expecting rogue IP from auth-logs
                     if action.answer and action.answer.strip() == "192.168.1.50":
+                        reward = 0.85
                         terminated = True
                         obs.info = "Correct! Rogue IP identified. Task completed."
                     else:
                 elif self.task_id == "medium":
                     obs.info = "For the medium task, use the 'modify' action to update the EC2 security group, not 'submit'."
+            self.score = min(0.99, self.score + reward)
             obs.reward = reward
             obs.done = terminated or truncated
             return obs
             import traceback
             print(f"ERROR in environment.step: {str(e)}", file=sys.stderr)
             traceback.print_exc(file=sys.stderr)
+            return CloudObservation(status=f"Internal Server Error: {str(e)}", reward=0.01, done=True)
     def state(self) -> CloudState:
         return CloudState(

server/tasks.py CHANGED Viewed

@@ -5,19 +5,19 @@ TASKS = {
         "id": "easy",
         "description": "Identify all public S3 buckets in the 'prod' region. Submit their IDs as a comma-separated string.",
         "difficulty": "Easy",
-        "reward_threshold": 1.0,
     },
     "medium": {
         "id": "medium",
         "description": "Find the EC2 instance with ID 'i-0abcdef1234567890' in the 'dev' environment. Close port 3389 (RDP) by updating its security group rules to only allow port 22.",
         "difficulty": "Medium",
-        "reward_threshold": 1.0,
     },
     "hard": {
         "id": "hard",
         "description": "A rogue IAM role 'iam-role-01' has been performing unauthorized actions. Fetch the 'auth-logs' and identify the IP address that performed 'DeleteStorage'. Submit the IP address.",
         "difficulty": "Hard",
-        "reward_threshold": 1.0,
     }
 }

         "id": "easy",
         "description": "Identify all public S3 buckets in the 'prod' region. Submit their IDs as a comma-separated string.",
         "difficulty": "Easy",
+        "reward_threshold": 0.8,
     },
     "medium": {
         "id": "medium",
         "description": "Find the EC2 instance with ID 'i-0abcdef1234567890' in the 'dev' environment. Close port 3389 (RDP) by updating its security group rules to only allow port 22.",
         "difficulty": "Medium",
+        "reward_threshold": 0.8,
     },
     "hard": {
         "id": "hard",
         "description": "A rogue IAM role 'iam-role-01' has been performing unauthorized actions. Fetch the 'auth-logs' and identify the IP address that performed 'DeleteStorage'. Submit the IP address.",
         "difficulty": "Hard",
+        "reward_threshold": 0.8,
     }
 }