Humanlearning committed on
Commit
2eada22
·
1 Parent(s): be8eade

feat: add episode trace fingerprinting for improved trace logging and update reward penalties in GRPO configuration

Browse files
scripts/modal_train_grpo.py CHANGED
@@ -518,6 +518,7 @@ def train_cybersecurity_owasp_grpo(
518
  from training.trackio_utils import (
519
  aggregate_episode_metrics,
520
  episode_record_from_state,
 
521
  log_gpu_metrics,
522
  log_trace_table,
523
  log_trackio_metrics,
@@ -886,6 +887,7 @@ def train_cybersecurity_owasp_grpo(
886
  pass
887
 
888
  trace_step = {"value": 0}
 
889
 
890
  def _completion_to_text(completion) -> str:
891
  if completion is None:
@@ -950,16 +952,28 @@ def train_cybersecurity_owasp_grpo(
950
  except Exception as exc:
951
  print(f"Trackio metric logging skipped: {exc!r}")
952
 
 
 
 
 
 
 
 
 
 
 
 
 
953
  try:
954
  log_trace_table(
955
- episode_records[: min(4, len(episode_records))],
956
  table_name="sample_traces",
957
  step=trace_step["value"],
958
  )
959
  except Exception as exc:
960
  print(f"Trackio sample trace table logging skipped: {exc!r}")
961
 
962
- for index, env in enumerate(environments):
963
  messages = list(getattr(env, "trace_messages", []))
964
  if index < len(completions):
965
  completion_text = _completion_to_text(completions[index])
@@ -974,8 +988,9 @@ def train_cybersecurity_owasp_grpo(
974
  metadata.update(
975
  {
976
  "sample_index": index,
977
- "reward": rewards[index],
978
  "trace_step": trace_step["value"],
 
979
  "run_name": run_name,
980
  }
981
  )
 
518
  from training.trackio_utils import (
519
  aggregate_episode_metrics,
520
  episode_record_from_state,
521
+ episode_trace_fingerprint,
522
  log_gpu_metrics,
523
  log_trace_table,
524
  log_trackio_metrics,
 
887
  pass
888
 
889
  trace_step = {"value": 0}
890
+ logged_trace_fingerprints: set[str] = set()
891
 
892
  def _completion_to_text(completion) -> str:
893
  if completion is None:
 
952
  except Exception as exc:
953
  print(f"Trackio metric logging skipped: {exc!r}")
954
 
955
+ sampled_traces = []
956
+ seen_this_batch: set[str] = set()
957
+ for index, (env, record, reward) in enumerate(zip(environments, episode_records, rewards)):
958
+ fingerprint = episode_trace_fingerprint(record)
959
+ if fingerprint in seen_this_batch or fingerprint in logged_trace_fingerprints:
960
+ continue
961
+ seen_this_batch.add(fingerprint)
962
+ logged_trace_fingerprints.add(fingerprint)
963
+ sampled_traces.append((index, env, record, reward, fingerprint))
964
+ if len(sampled_traces) >= 4:
965
+ break
966
+
967
  try:
968
  log_trace_table(
969
+ [record for _, _, record, _, _ in sampled_traces],
970
  table_name="sample_traces",
971
  step=trace_step["value"],
972
  )
973
  except Exception as exc:
974
  print(f"Trackio sample trace table logging skipped: {exc!r}")
975
 
976
+ for index, env, _record, reward, fingerprint in sampled_traces:
977
  messages = list(getattr(env, "trace_messages", []))
978
  if index < len(completions):
979
  completion_text = _completion_to_text(completions[index])
 
988
  metadata.update(
989
  {
990
  "sample_index": index,
991
+ "reward": reward,
992
  "trace_step": trace_step["value"],
993
+ "trace_fingerprint": fingerprint,
994
  "run_name": run_name,
995
  }
996
  )
tests/test_rewards.py CHANGED
@@ -114,8 +114,9 @@ def test_repeated_futile_actions_are_penalized(monkeypatch):
114
 
115
  assert first.reward_breakdown["progressive"] > 0.0
116
  assert second.reward_breakdown["progressive"] == 0.0
117
- assert second.reward_breakdown["behavior_penalty"] <= -0.10
118
  assert second.reward_breakdown["total"] < 0.0
 
119
 
120
 
121
  def test_dense_episode_reward_cap_blocks_repeated_positive_farming(monkeypatch):
 
114
 
115
  assert first.reward_breakdown["progressive"] > 0.0
116
  assert second.reward_breakdown["progressive"] == 0.0
117
+ assert second.reward_breakdown["behavior_penalty"] <= -0.50
118
  assert second.reward_breakdown["total"] < 0.0
119
+ assert env.state.accumulated_reward < 0.0
120
 
121
 
122
  def test_dense_episode_reward_cap_blocks_repeated_positive_farming(monkeypatch):
tests/test_trackio_utils.py CHANGED
@@ -6,6 +6,7 @@ from training.trackio_utils import (
6
  DERIVED_TRACKIO_METRICS,
7
  aggregate_episode_metrics,
8
  episode_record_from_state,
 
9
  episode_to_trace_row,
10
  episode_to_tracking_fields,
11
  )
@@ -91,3 +92,32 @@ def test_trace_rows_redact_hidden_values_from_action_arguments():
91
  assert not value or value not in row_text
92
  finally:
93
  env.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  DERIVED_TRACKIO_METRICS,
7
  aggregate_episode_metrics,
8
  episode_record_from_state,
9
+ episode_trace_fingerprint,
10
  episode_to_trace_row,
11
  episode_to_tracking_fields,
12
  )
 
92
  assert not value or value not in row_text
93
  finally:
94
  env.close()
95
+
96
+
97
+ def test_trace_fingerprint_ignores_episode_id_but_tracks_action_changes():
98
+ base_record = {
99
+ "episode_id": "episode-a",
100
+ "task_id": "task-1",
101
+ "scenario/seed": 123,
102
+ "scenario/split": "train",
103
+ "scenario/difficulty": 0,
104
+ "scenario/bug_type": "bola_idor",
105
+ "action_history": [
106
+ {
107
+ "tool_name": "read_file",
108
+ "arguments": {"path": "app/routes/invoices.py"},
109
+ }
110
+ ],
111
+ "observation_history": [{"last_action_valid": True}],
112
+ "reward_breakdown": {"total": 0.0},
113
+ }
114
+ same_trace = dict(base_record)
115
+ same_trace["episode_id"] = "episode-b"
116
+ changed_trace = dict(base_record)
117
+ changed_trace["action_history"] = [
118
+ *base_record["action_history"],
119
+ {"tool_name": "submit_fix", "arguments": {}},
120
+ ]
121
+
122
+ assert episode_trace_fingerprint(base_record) == episode_trace_fingerprint(same_trace)
123
+ assert episode_trace_fingerprint(base_record) != episode_trace_fingerprint(changed_trace)
training/configs/grpo_small.yaml CHANGED
@@ -90,19 +90,19 @@ reward:
90
  value: -0.30
91
  description: "Penalty for repeating the same failed action."
92
  repeated_low_value_action:
93
- value: -0.10
94
  description: "Penalty for repeating the exact same non-progress action."
95
  no_progress_action:
96
- value: -0.05
97
  description: "Penalty for valid tool calls that add no new useful progress."
98
  noop_action:
99
  value: -0.02
100
  description: "Small penalty for spending a step without acting."
101
  repeated_file_read:
102
- value: -0.05
103
  description: "Penalty for rereading the same file without a patch change."
104
  repeated_local_request:
105
- value: -0.05
106
  description: "Penalty for repeating the same local request after evidence is known."
107
  repeated_visible_tests:
108
  value: -0.10
 
90
  value: -0.30
91
  description: "Penalty for repeating the same failed action."
92
  repeated_low_value_action:
93
+ value: -0.40
94
  description: "Penalty for repeating the exact same non-progress action."
95
  no_progress_action:
96
+ value: -0.15
97
  description: "Penalty for valid tool calls that add no new useful progress."
98
  noop_action:
99
  value: -0.02
100
  description: "Small penalty for spending a step without acting."
101
  repeated_file_read:
102
+ value: -0.20
103
  description: "Penalty for rereading the same file without a patch change."
104
  repeated_local_request:
105
+ value: -0.20
106
  description: "Penalty for repeating the same local request after evidence is known."
107
  repeated_visible_tests:
108
  value: -0.10
training/trackio_utils.py CHANGED
@@ -882,6 +882,24 @@ def trace_table_rows(episodes: Sequence[Any]) -> list[dict[str, Any]]:
882
  return [episode_to_trace_row(episode) for episode in episodes]
883
 
884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885
  def log_trace_table(
886
  episodes: Sequence[Any],
887
  *,
 
882
  return [episode_to_trace_row(episode) for episode in episodes]
883
 
884
 
885
+ def episode_trace_fingerprint(episode: Any) -> str:
886
+ """Return a stable fingerprint for a redacted trace row.
887
+
888
+ The episode id is intentionally excluded so repeated GRPO samples with the
889
+ same scenario/action trace do not appear as separate Trackio examples.
890
+ """
891
+
892
+ row = episode_to_trace_row(episode)
893
+ return _stable_hash(
894
+ {
895
+ key: row.get(key, "")
896
+ for key in TRACE_TABLE_COLUMNS
897
+ if key != "episode_id"
898
+ },
899
+ length=24,
900
+ )
901
+
902
+
903
  def log_trace_table(
904
  episodes: Sequence[Any],
905
  *,