{
  "rounds": 3,
  "output_dir": "artifacts/self_play",
  "dry_run": true,
  "canonical_graph_mode": "generate",
  "pipeline_mode": "swarm_v2",
  "model_topology": "dual",
  "phase_schedule": "generator_answerer",
  "tuning_mode": "full",
  "shared_model_name_or_path": "",
  "seed_tasks_per_round": 16,
  "generated_tasks_per_round": 24,
  "generator_prompts_per_round": 24,
  "max_graph_context_nodes": 100,
  "max_graph_context_edges": 100,
  "max_support_edges": 8,
  "answerer_judge_max_new_tokens": 48,
  "generator_reward_weights": {
    "validity": 0.35,
    "hardness": 0.45,
    "diversity": 0.1,
    "consistency": 0.1
  },
  "lora": {
    "r": 16,
    "alpha": 32,
    "dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "swarm_v2": {
    "generator_swarm": {
      "shared_context": true,
      "max_agents": 4,
      "max_breadth": 3,
      "max_depth": 2,
      "planner_rounds": 2,
      "tools_per_agent": 2
    },
    "answerer_swarm": {
      "shared_context": true,
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "planner_rounds": 2,
      "tools_per_agent": 2
    },
    "validation": {
      "max_support_edges": 8,
      "max_path_hops": 4,
      "max_context_nodes": 14,
      "max_context_edges": 8,
      "duplicate_similarity_threshold": 0.8
    },
    "shared_context": {
      "shared_by_default": true,
      "max_nodes": 14,
      "max_edges": 8,
      "target_pressure": 0.85
    }
  },
  "generator_phase": {
    "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
    "learning_rate": 1e-06,
    "max_steps": 64,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_generations": 4,
    "max_completion_length": 256,
    "temperature": 1.0,
    "top_p": 1.0,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "none",
    "logging_steps": 10,
    "save_steps": 50,
    "output_subdir": "generator",
    "use_vllm": false,
    "vllm_mode": "colocate"
  },
  "answerer_phase": {
    "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
    "learning_rate": 1e-06,
    "max_steps": 64,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_generations": 4,
    "max_completion_length": 192,
    "temperature": 1.0,
    "top_p": 1.0,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "none",
    "logging_steps": 10,
    "save_steps": 50,
    "output_subdir": "answerer",
    "use_vllm": false,
    "vllm_mode": "colocate"
  }
}