| from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig |
| import torch |
| import pandas as pd |
| import wandb |
|
|
| |
# Start a W&B run to record per-sample scores and final metrics.
# Fix: run name was misspelled "fomatted"; align it with the
# formatted_test.parquet dataset (lr 5e-6, 1500 sampled pairs).
wandb.init(
    project="reward_model_scoring",
    name="formatted_5e-6_1500",
)
|
|
| |
# Path to the reward-model checkpoint (HF format, converted from a
# DeepSpeed global step — TODO confirm the conversion matches this tokenizer).
rm_path = "/home/ckpt/5e-6/global_step180_hf"


tokenizer = AutoTokenizer.from_pretrained(rm_path)


# Force a single-label head: the sequence-classification model then
# emits one scalar logit per input, used directly as the reward score.
config = AutoConfig.from_pretrained(rm_path)
config.num_labels = 1


# device_map="auto" shards/places the model across available devices
# (requires `accelerate` to be installed).
model = AutoModelForSequenceClassification.from_pretrained(
    rm_path,
    config=config,
    device_map="auto"
)
# Inference only: disable dropout etc.
model.eval()
|
|
| |
def get_reward_score(texts):
    """Score a batch of texts with the reward model.

    Tokenizes `texts` (padded, truncated to 8192 tokens), runs a forward
    pass without gradients, and returns one float reward per input text.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=8192,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    with torch.no_grad():
        logits = model(**encoded).logits

    # Single-label head: drop the trailing dim to get one scalar per text.
    return logits.squeeze(-1).float().cpu().tolist()
|
|
| |
# Load the evaluation pairs and draw a fixed, reproducible 1500-row sample.
df = pd.read_parquet("/home/data/formatted_test.parquet")
df = df.sample(n=1500, random_state=42)
df = df.reset_index(drop=True)
|
|
def format_input(prompt, answer, sep=""):
    """Build the text fed to the reward model from a prompt/answer pair.

    Generalized with an optional `sep` (default "" — unchanged behavior)
    so a newline or chat delimiter can be inserted without touching callers.
    """
    return prompt + sep + answer
|
|
# Both sides share the same prompt column; pair it with each response.
prompts = df["chosen_prompt"]
chosen_texts = [format_input(p, a) for p, a in zip(prompts, df["chosen"])]
rejected_texts = [format_input(p, r) for p, r in zip(prompts, df["reject"])]


# Per-sample accumulators filled by the scoring loop below.
chosen_scores = []
rejected_scores = []
accs = []


# One row per evaluated pair, logged to W&B at the end.
table_columns = [
    "index", "prompt", "chosen", "rejected",
    "chosen_score", "rejected_score", "delta_score", "acc",
]
sample_table = wandb.Table(columns=table_columns)
|
|
| |
# Score both sides of every pair in mini-batches, tracking a running
# pairwise accuracy (chosen must outscore rejected; ties count as wrong).
# Fix: the running mean was recomputed as sum(accs)/len(accs) on every
# sample — accidental O(n^2); keep a running correct-count instead.
batch_size = 16
num_correct = 0

for start in range(0, len(chosen_texts), batch_size):
    chosen_batch_scores = get_reward_score(chosen_texts[start:start + batch_size])
    rejected_batch_scores = get_reward_score(rejected_texts[start:start + batch_size])

    for offset, (c_score, r_score) in enumerate(zip(chosen_batch_scores, rejected_batch_scores)):
        idx = start + offset
        delta = c_score - r_score
        acc = int(delta > 0)

        chosen_scores.append(c_score)
        rejected_scores.append(r_score)
        accs.append(acc)

        num_correct += acc
        current_accuracy = num_correct / len(accs)
        print(f"[{idx}] acc={acc}, chosen_reward={c_score:.3f}, reject_reward={r_score:.3f} | 当前平均准确率: {current_accuracy:.3f}")

        sample_table.add_data(
            idx,
            df.loc[idx, "chosen_prompt"],
            df.loc[idx, "chosen"],
            df.loc[idx, "reject"],
            c_score,
            r_score,
            delta,
            acc,
        )
|
|
| |
# Attach per-sample results to the dataframe for aggregate statistics.
df["chosen_score"] = chosen_scores
df["rejected_score"] = rejected_scores
df["delta_score"] = df["chosen_score"] - df["rejected_score"]
df["acc"] = accs


# Aggregate metrics, keyed exactly as they are logged to W&B.
summary = {
    "final_accuracy": df["acc"].mean(),
    "mean_chosen_score": df["chosen_score"].mean(),
    "mean_rejected_score": df["rejected_score"].mean(),
    "mean_delta_score": df["delta_score"].mean(),
}


print(f"\n✅ Reward Model Accuracy = {summary['final_accuracy']:.3f}")
print(f"📊 mean_chosen = {summary['mean_chosen_score']:.3f}, mean_rejected = {summary['mean_rejected_score']:.3f}, mean_delta = {summary['mean_delta_score']:.3f}")


wandb.log({
    "samples_table": sample_table,
    **summary,
})


wandb.finish()
|
|