| [ | |
| "tests/test_ast_validator.py::test_attribute_eval_fails", | |
| "tests/test_ast_validator.py::test_builtins_assignment_fails", | |
| "tests/test_ast_validator.py::test_clean_script_passes", | |
| "tests/test_ast_validator.py::test_eval_fails", | |
| "tests/test_ast_validator.py::test_os_import_fails", | |
| "tests/test_ast_validator.py::test_socket_import_fails", | |
| "tests/test_ast_validator.py::test_subprocess_fails", | |
| "tests/test_ast_validator.py::test_syntax_error_fails", | |
| "tests/test_ast_validator.py::test_transformers_import_passes", | |
| "tests/test_environment.py::test_action_validation_rejects_both_or_neither", | |
| "tests/test_environment.py::test_full_episode_lifecycle", | |
| "tests/test_environment.py::test_invalid_action_for_phase", | |
| "tests/test_environment.py::test_reset_returns_drift_gen_observation", | |
| "tests/test_environment.py::test_state_property_is_dict", | |
| "tests/test_environment.py::test_step_before_reset_returns_error", | |
| "tests/test_environment.py::test_teacher_updates_after_episode", | |
| "tests/test_environment.py::test_unified_diff_full_script_replacement", | |
| "tests/test_environment.py::test_unified_diff_round_trip", | |
| "tests/test_evaluators.py::test_alignment_score_anti_correlation", | |
| "tests/test_evaluators.py::test_alignment_score_constant_returns_zero", | |
| "tests/test_evaluators.py::test_alignment_score_perfect_correlation", | |
| "tests/test_evaluators.py::test_drift_gen_reward_combines_signals", | |
| "tests/test_evaluators.py::test_held_out_success", | |
| "tests/test_evaluators.py::test_held_out_workaround_detection", | |
| "tests/test_evaluators.py::test_repetition_penalty_higher_for_duplicates", | |
| "tests/test_evaluators.py::test_uncertainty_handles_empty", | |
| "tests/test_evaluators.py::test_uncertainty_peaks_at_half", | |
| "tests/test_evaluators.py::test_visible_reward_failure", | |
| "tests/test_evaluators.py::test_visible_reward_success", | |
| "tests/test_primitives.py::test_all_8_primitives_registered", | |
| "tests/test_primitives.py::test_breakage_creates_actual_difference", | |
| "tests/test_primitives.py::test_breakage_repair_registry_alignment", | |
| "tests/test_primitives.py::test_change_argument_signature_removes_kwarg", | |
| "tests/test_primitives.py::test_change_return_type_swaps_access", | |
| "tests/test_primitives.py::test_change_tokenizer_behavior_replaces_kwarg", | |
| "tests/test_primitives.py::test_deprecate_import", | |
| "tests/test_primitives.py::test_modify_config_field_changes_value", | |
| "tests/test_primitives.py::test_parse_spec_ignores_extra_kwargs", | |
| "tests/test_primitives.py::test_parse_spec_round_trip", | |
| "tests/test_primitives.py::test_parse_spec_unknown_raises", | |
| "tests/test_primitives.py::test_remove_deprecated_method_marks_call", | |
| "tests/test_primitives.py::test_rename_api_call_word_boundary", | |
| "tests/test_primitives.py::test_restructure_dataset_string_replacement", | |
| "tests/test_primitives.py::test_seed_corpus_has_at_least_10_scripts", | |
| "tests/test_primitives.py::test_task_sampler_categories_are_diverse", | |
| "tests/test_primitives.py::test_task_sampler_difficulty_filter", | |
| "tests/test_primitives.py::test_task_sampler_get_by_id", | |
| "tests/test_roles.py::test_baseline_drift_generator_produces_valid_spec", | |
| "tests/test_roles.py::test_baseline_drift_generator_spec_actually_breaks_script", | |
| "tests/test_roles.py::test_baseline_repair_agent_inverts_breakage_spec", | |
| "tests/test_roles.py::test_baseline_repair_agent_oracle_path", | |
| "tests/test_roles.py::test_extract_diff_strips_chain_of_thought", | |
| "tests/test_roles.py::test_extract_diff_strips_fences", | |
| "tests/test_roles.py::test_looks_like_diff_negative", | |
| "tests/test_roles.py::test_looks_like_diff_positive", | |
| "tests/test_roles.py::test_parse_drift_output_handles_fences", | |
| "tests/test_roles.py::test_parse_drift_output_handles_prose", | |
| "tests/test_roles.py::test_parse_drift_output_returns_none_on_garbage", | |
| "tests/test_roles.py::test_parse_drift_to_primitive_unknown_type", | |
| "tests/test_roles.py::test_parse_drift_to_primitive_validates", | |
| "tests/test_roles.py::test_prompts_are_nonempty", | |
| "tests/test_roles.py::test_render_drift_generator_prompt_includes_inputs", | |
| "tests/test_roles.py::test_render_repair_agent_prompt_includes_error_trace", | |
| "tests/test_simulation_mode.py::test_forbidden_import_fails", | |
| "tests/test_simulation_mode.py::test_seed_is_deterministic", | |
| "tests/test_simulation_mode.py::test_simulation_is_fast", | |
| "tests/test_simulation_mode.py::test_syntax_error_fails", | |
| "tests/test_simulation_mode.py::test_valid_script_can_succeed", | |
| "tests/test_training.py::test_grpo_drift_dry_run_smoke", | |
| "tests/test_training.py::test_grpo_repair_dry_run_smoke", | |
| "tests/test_training.py::test_rollout_one_episode_baseline_no_op_repair", | |
| "tests/test_training.py::test_rollout_one_episode_with_oracle_repair_succeeds", | |
| "tests/test_warmstart.py::test_generate_pairs_covers_multiple_primitive_types", | |
| "tests/test_warmstart.py::test_generate_pairs_produces_minimum_count" | |
| ] |