| Check QUESTION_TEMPLATE! |
|
|
| kl: 0.01 |
| lr: 1e-6 |
| epoch: 1 |
| with cold-start data |
|
|
| # Perturb the advantages: draw standard-normal samples with the same shape as |
| # `advantages`, scale them by 0.02, and add them element-wise. |
| # NOTE(review): presumably used to break ties between equal advantages - confirm. |
| noise = torch.randn_like(advantages) * 0.02 |
| advantages = advantages + noise |
|
|
| # format_reward: score each completion for output format, returning one float per completion. |
| # - `completions`: iterable whose items are indexed as completion[0]["content"] to get the text. |
| # - `**kwargs`: accepted but not used in the visible body. |
| # - Returns a list of rewards in the same order as `completions`. |
| # Scoring as written: +0.5 when the whole text matches the think/answer pattern and |
| # recheck_format(content) is truthy; plus a bonus of min(len(think) / 1200, 1) * 0.5, |
| # i.e. up to 0.5 more, saturating once len(think) reaches 1200. |
| # NOTE(review): indentation was lost in this copy, so it is unclear whether the length bonus |
| # is applied only when the format check passes or for every completion - confirm against the |
| # real source before relying on it. |
| def format_reward(completions, **kwargs): |
| # The entire text must be <think>...</think>, optional whitespace, then <answer>...</answer>. |
| pattern = r"<think>.*?</think>\s*<answer>.*?</answer>" |
| completion_contents = [completion[0]["content"] for completion in completions] |
| reward = [] |
| for content in completion_contents: |
| for_re = 0.0 |
| # re.DOTALL lets "." span newlines; fullmatch anchors the pattern to the whole string. |
| # recheck_format is defined elsewhere; presumably a stricter tag check - TODO confirm. |
| if re.fullmatch(pattern, content, re.DOTALL) and recheck_format(content): |
| for_re += 0.5 |
| # extract_first_think_answer is defined elsewhere; presumably returns the text of the |
| # first <think> section (only len() is taken of its result here) - TODO confirm. |
| think = extract_first_think_answer(content) |
| for_re += min(len(think) / 1200, 1) * 0.5 |
| reward.append(for_re) |
| return reward |