File size: 4,983 Bytes
b0fbec3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
[
  "tests/test_ast_validator.py::test_attribute_eval_fails",
  "tests/test_ast_validator.py::test_builtins_assignment_fails",
  "tests/test_ast_validator.py::test_clean_script_passes",
  "tests/test_ast_validator.py::test_eval_fails",
  "tests/test_ast_validator.py::test_os_import_fails",
  "tests/test_ast_validator.py::test_socket_import_fails",
  "tests/test_ast_validator.py::test_subprocess_fails",
  "tests/test_ast_validator.py::test_syntax_error_fails",
  "tests/test_ast_validator.py::test_transformers_import_passes",
  "tests/test_environment.py::test_action_validation_rejects_both_or_neither",
  "tests/test_environment.py::test_full_episode_lifecycle",
  "tests/test_environment.py::test_invalid_action_for_phase",
  "tests/test_environment.py::test_reset_returns_drift_gen_observation",
  "tests/test_environment.py::test_state_property_is_dict",
  "tests/test_environment.py::test_step_before_reset_returns_error",
  "tests/test_environment.py::test_teacher_updates_after_episode",
  "tests/test_environment.py::test_unified_diff_full_script_replacement",
  "tests/test_environment.py::test_unified_diff_round_trip",
  "tests/test_evaluators.py::test_alignment_score_anti_correlation",
  "tests/test_evaluators.py::test_alignment_score_constant_returns_zero",
  "tests/test_evaluators.py::test_alignment_score_perfect_correlation",
  "tests/test_evaluators.py::test_drift_gen_reward_combines_signals",
  "tests/test_evaluators.py::test_held_out_success",
  "tests/test_evaluators.py::test_held_out_workaround_detection",
  "tests/test_evaluators.py::test_repetition_penalty_higher_for_duplicates",
  "tests/test_evaluators.py::test_uncertainty_handles_empty",
  "tests/test_evaluators.py::test_uncertainty_peaks_at_half",
  "tests/test_evaluators.py::test_visible_reward_failure",
  "tests/test_evaluators.py::test_visible_reward_success",
  "tests/test_primitives.py::test_all_8_primitives_registered",
  "tests/test_primitives.py::test_breakage_creates_actual_difference",
  "tests/test_primitives.py::test_breakage_repair_registry_alignment",
  "tests/test_primitives.py::test_change_argument_signature_removes_kwarg",
  "tests/test_primitives.py::test_change_return_type_swaps_access",
  "tests/test_primitives.py::test_change_tokenizer_behavior_replaces_kwarg",
  "tests/test_primitives.py::test_deprecate_import",
  "tests/test_primitives.py::test_modify_config_field_changes_value",
  "tests/test_primitives.py::test_parse_spec_ignores_extra_kwargs",
  "tests/test_primitives.py::test_parse_spec_round_trip",
  "tests/test_primitives.py::test_parse_spec_unknown_raises",
  "tests/test_primitives.py::test_remove_deprecated_method_marks_call",
  "tests/test_primitives.py::test_rename_api_call_word_boundary",
  "tests/test_primitives.py::test_restructure_dataset_string_replacement",
  "tests/test_primitives.py::test_seed_corpus_has_at_least_10_scripts",
  "tests/test_primitives.py::test_task_sampler_categories_are_diverse",
  "tests/test_primitives.py::test_task_sampler_difficulty_filter",
  "tests/test_primitives.py::test_task_sampler_get_by_id",
  "tests/test_roles.py::test_baseline_drift_generator_produces_valid_spec",
  "tests/test_roles.py::test_baseline_drift_generator_spec_actually_breaks_script",
  "tests/test_roles.py::test_baseline_repair_agent_inverts_breakage_spec",
  "tests/test_roles.py::test_baseline_repair_agent_oracle_path",
  "tests/test_roles.py::test_extract_diff_strips_chain_of_thought",
  "tests/test_roles.py::test_extract_diff_strips_fences",
  "tests/test_roles.py::test_looks_like_diff_negative",
  "tests/test_roles.py::test_looks_like_diff_positive",
  "tests/test_roles.py::test_parse_drift_output_handles_fences",
  "tests/test_roles.py::test_parse_drift_output_handles_prose",
  "tests/test_roles.py::test_parse_drift_output_returns_none_on_garbage",
  "tests/test_roles.py::test_parse_drift_to_primitive_unknown_type",
  "tests/test_roles.py::test_parse_drift_to_primitive_validates",
  "tests/test_roles.py::test_prompts_are_nonempty",
  "tests/test_roles.py::test_render_drift_generator_prompt_includes_inputs",
  "tests/test_roles.py::test_render_repair_agent_prompt_includes_error_trace",
  "tests/test_simulation_mode.py::test_forbidden_import_fails",
  "tests/test_simulation_mode.py::test_seed_is_deterministic",
  "tests/test_simulation_mode.py::test_simulation_is_fast",
  "tests/test_simulation_mode.py::test_syntax_error_fails",
  "tests/test_simulation_mode.py::test_valid_script_can_succeed",
  "tests/test_training.py::test_grpo_drift_dry_run_smoke",
  "tests/test_training.py::test_grpo_repair_dry_run_smoke",
  "tests/test_training.py::test_rollout_one_episode_baseline_no_op_repair",
  "tests/test_training.py::test_rollout_one_episode_with_oracle_repair_succeeds",
  "tests/test_warmstart.py::test_generate_pairs_covers_multiple_primitive_types",
  "tests/test_warmstart.py::test_generate_pairs_produces_minimum_count"
]