| |
| import os |
| import math |
| import pandas as pd |
| from tqdm import tqdm |
| import torch |
| from datasets import load_dataset |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer |
| from sentence_transformers import CrossEncoder |
# HF model id: Qwen3 8B reranker exported with a sequence-classification head.
MODEL_NAME = "deeppin/Qwen3-Reranker-8B-SequenceClassification"
# Parquet file expected to hold chosen_prompt / chosen / reject columns (validated below).
DATA_PATH = "data/valid.parquet"
# NOTE(review): BATCH_SIZE is not referenced by the per-sample scoring loop
# below — either remove it or wire it into a batched scorer.
BATCH_SIZE = 8
# Per-prompt token budget; longer inputs are left-truncated (truncation_side="left").
MAX_LENGTH = 8192
# Fall back to CPU when no CUDA device is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
def format_instruction(instruction, query, doc):
    """Assemble the reranker input text: tagged instruction, query and
    candidate document, one per line."""
    sections = (
        f"<Instruct>: {instruction}",
        f"<Query>: {query}",
        f"<Document>: {doc}",
    )
    return "\n".join(sections)
| import re |
| import re |
| _SYS_BLOCK = re.compile( |
| r"<\|im_start\|\>\s*system\b.*?<\|im_end\|\>", re.IGNORECASE | re.DOTALL |
| ) |
| _TURN_BLOCK = re.compile( |
| r"<\|im_start\|\>\s*(user|assistant)\b\s*(.*?)\s*<\|im_end\|\>", |
| re.IGNORECASE | re.DOTALL, |
| ) |
| _ANY_CHATML_TAG = re.compile(r"<\|[^|]+?\|>") |
|
|
| _SYS = re.compile(r"<\|im_start\|\>\s*system\b(.*?)<\|im_end\|\>", re.I|re.S) |
| _TURN = re.compile(r"<\|im_start\|\>\s*(user|assistant)\b(.*?)<\|im_end\|\>", re.I|re.S) |
| _TAG = re.compile(r"<\|[^|]+?\|>") |
|
|
| _START = re.compile(r"<\|im_start\|\>\s*(system|user|assistant)\s*", re.IGNORECASE) |
| _END = re.compile(r"<\|im_end\|\>", re.IGNORECASE) |
| _ANY = re.compile(r"<\|[^|>]+?\|>", re.IGNORECASE) |
| _THINK_BLOCK = re.compile(r"<think>.*?</think>", re.IGNORECASE | re.DOTALL) |
|
|
| def flatten_chatml(text: str, keep_think: bool = False, *, single_line: bool = False, sep: str = " ") -> str: |
| if not isinstance(text, str): |
| return "" |
|
|
| t = text.replace("\r\n", "\n") |
| if not keep_think: |
| t = _THINK_BLOCK.sub("", t) |
|
|
| t = _START.sub("", t) |
| t = _END.sub("\n", t) |
| t = _ANY.sub("", t) |
|
|
| |
| t = re.sub(r"[ \t]*\n[ \t]*", "\n", t) |
| t = re.sub(r"\n{3,}", "\n\n", t) |
| t = t.strip() |
|
|
| if single_line: |
| |
| t = t.replace("\r", "\n") |
| t = re.sub(r"[\n\u2028\u2029]+", sep, t) |
| |
| t = re.sub(r"[ \t\u00A0]{2,}", " ", t) |
| t = re.sub(r"\s{2,}", " ", t) |
| t = t.strip() |
| return t |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
# Load the reranker tokenizer. Left padding + left truncation keep the most
# recent tokens of long prompts, which matters because the document/reply
# sits at the end of the formatted input.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    padding_side="left",
    use_fast=False,
    trust_remote_code=True,
)
tokenizer.truncation_side = "left"

# Guarantee a pad token so padded batches of two prompts can be built:
# reuse EOS when available, otherwise register "<|endoftext|>" as pad.
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({"pad_token": "<|endoftext|>"})
|
|
| |
| |
| |
| |
| |
|
|
| |
# Load the reranker as a sequence classifier and move it to DEVICE.
# Fix: the model was hard-coded to .to("cuda"), which crashes on CPU-only
# hosts even though DEVICE (and the input tensors below) fall back to "cpu".
# NOTE(review): float16 + flash_attention_2 themselves assume a CUDA GPU —
# confirm the deployment target before relying on the CPU fallback.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
).to(DEVICE).eval()
# Keep the model config in sync with the tokenizer's pad token so padded
# positions are handled correctly by the classification head.
model.config.pad_token_id = tokenizer.pad_token_id
# Instruction prepended to every (query, candidate reply) pair scored below.
TASK = "Given a roleplay prompt and recent context, score candidate replies higher when they stay in character, continue the scene coherently, and feel vivid and engaging."
|
|
| |
# Read the validation pairs and fail fast if any required column is absent.
df = pd.read_parquet(DATA_PATH)
need_cols = ["chosen_prompt", "chosen", "reject"]
available = set(df.columns)
for col in need_cols:
    if col not in available:
        raise ValueError(f"缺少必要列:{col}")
|
|
def norm_text(x):
    """Coerce a cell value to a stripped string; None and NaN become ""."""
    if x is None:
        return ""
    if isinstance(x, float) and math.isnan(x):
        return ""
    return str(x).strip()
|
|
# Keep only the needed columns and flatten every cell to a single line.
# NOTE(review): sep="" glues adjacent lines together with no separator —
# confirm this is intentional (sep=" " would keep a space between turns).
df = df[need_cols].copy()

def _clean_cell(value):
    # Normalize to str first, then strip ChatML markup down to one line.
    return flatten_chatml(norm_text(value), single_line=True, sep="")

for col in need_cols:
    df[col] = df[col].map(_clean_cell)

# Drop rows where any field emptied out after cleaning.
non_empty = df[need_cols].apply(lambda column: column.str.len() > 0).all(axis=1)
df = df[non_empty].reset_index(drop=True)
total = len(df)
if total == 0:
    raise ValueError("过滤后无有效样本。请检查数据内容。")
print(f"[Info] 有效样本数: {total}")
|
|
| |
# ---- Pairwise evaluation: the chosen reply should outscore the reject ----
correct = 0  # pairs where the chosen reply won
seen = 0     # pairs scored so far

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Scoring (per-sample)"):
    # One formatted prompt per candidate reply, sharing the same context.
    prompts = [
        format_instruction(TASK, row["chosen_prompt"], row["chosen"]),
        format_instruction(TASK, row["chosen_prompt"], row["reject"]),
    ]

    # Tokenize both candidates as one padded batch; left truncation keeps
    # the most recent context when a prompt exceeds MAX_LENGTH.
    batch = tokenizer(prompts, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}

    with torch.no_grad():
        # NOTE(review): squeeze(-1) assumes a single-logit head
        # (num_labels == 1) — confirm against the model config.
        scores = model(**batch).logits.squeeze(-1)

    score_chosen = float(scores[0])
    score_reject = float(scores[1])
    is_correct = score_chosen > score_reject

    correct += int(is_correct)
    seen += 1
    pair_scores = [score_chosen, score_reject]
    print(f"[{idx}] logits={pair_scores} | first>second={is_correct} | running_acc={correct/seen:.2%} ({correct}/{seen})")

print(f"\n[Result] Total={seen} | Correct={correct} | Accuracy={correct/seen:.2%}")