Video-R1 / src /generate_cot_vllm.py

Add files using upload-large-folder tool

bb7f76d verified 8 months ago

10.5 kB

	import os
	import json
	import re
	from tqdm import tqdm
	from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
	from rouge_score import rouge_scorer
	import torch

	from transformers import AutoProcessor, AutoTokenizer
	from vllm import LLM, SamplingParams
	from qwen_vl_utils import process_vision_info


	# Checkpoint used for generation and the number of samples sent per call.
	MODEL_PATH = "Qwen/Qwen2.5-VL-72B-Instruct"
	BSZ = 32

	# vLLM engine; the tensor-parallel degree equals the CUDA device count
	# reported by torch.
	llm = LLM(
	    model=MODEL_PATH,
	    tensor_parallel_size=torch.cuda.device_count(),
	    max_model_len=8192,
	    gpu_memory_utilization=0.8,
	    limit_mm_per_prompt={"image": 10, "video": 10},
	)

	# Sampling settings for generation; no extra stop token ids are supplied.
	sampling_params = SamplingParams(temperature=1.0, top_p=0.95, max_tokens=512, stop_token_ids=[])

	# The processor renders the chat template; its tokenizer is replaced by a
	# separately loaded one configured for left padding.
	processor = AutoProcessor.from_pretrained(MODEL_PATH)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
	tokenizer.padding_side = "left"
	processor.tokenizer = tokenizer

	# Iterate over the dataset names to process; the two paths below are derived
	# from the current name.
	# NOTE(review): indentation is not preserved in this view of the file, so the
	# extent of this loop's body cannot be read off these lines — confirm against
	# the original source.
	for dataset_name in ['your_data_name']:

	# JSON file the results are written to, and read back from when resuming.
	OUTPUT_PATH = f"./src/r1-v/Video-R1-data/{dataset_name}_COT_qwen72b.json"
	# Input records to annotate; loaded below as .json or .jsonl.
	PROMPT_PATH = f"./src/r1-v/Video-R1-data/{dataset_name}.json"

	# Load the input records: one JSON object per line for .jsonl, a single JSON
	# document for .json. Any other extension is rejected.
	data = []
	if PROMPT_PATH.endswith('.jsonl'):
	    with open(PROMPT_PATH, "r", encoding="utf-8") as prompt_file:
	        data.extend(json.loads(row) for row in prompt_file)
	elif PROMPT_PATH.endswith('.json'):
	    with open(PROMPT_PATH, "r", encoding="utf-8") as prompt_file:
	        data = json.load(prompt_file)
	else:
	    raise ValueError("Input file must be .json or .jsonl")


	# Prompt wrapper: the question text followed by instructions to reason inside
	# <think> tags and to answer inside <answer> tags. Filled via
	# QUESTION_TEMPLATE.format(Question=...).
	QUESTION_TEMPLATE = "".join([
	    "{Question}\n",
	    "Please think about this question as if you were a human pondering deeply. ",
	    "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions ",
	    "It's encouraged to include self-reflection or verification in the reasoning process. ",
	    "Provide your detailed reasoning between the <think> and </think> tags, and then give your final answer between the <answer> and </answer> tags.",
	])

	# The "numerical" and "regression" problem types share the same hint text.
	_NUMERIC_HINT = " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags."

	# Answer-format hint appended after the question prompt, keyed by problem type.
	TYPE_TEMPLATE = {
	    "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
	    "numerical": _NUMERIC_HINT,
	    "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
	    "free-form": " Please provide your text answer within the <answer> </answer> tags.",
	    "regression": _NUMERIC_HINT,
	}


	# Build one single-turn user conversation per record: the media item first,
	# then the formatted question plus the answer-format hint for its type.
	messages = []
	for record in data:
	    is_choice = record["problem_type"] == 'multiple choice'
	    question = record['problem']
	    if is_choice:
	        # The option lines follow the problem text, one per line.
	        question += "Options:\n"
	        question += "".join(op + "\n" for op in record["options"])

	    media_kind = record['data_type']
	    # The first character of the record's path is dropped before it is
	    # appended to the data directory under the current working directory.
	    media_path = os.getcwd() + "/src/r1-v/Video-R1-data" + record['path'][1:]
	    prompt_text = QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[record['problem_type']]

	    media_part = {"type": media_kind, media_kind: media_path}
	    text_part = {"type": "text", "text": prompt_text}
	    messages.append([{"role": "user", "content": [media_part, text_part]}])

	# Resume support: when an output file already exists, reuse its saved results
	# and continue from the first sample that has no result yet.
	final_output = []
	start_idx = 0
	if os.path.exists(OUTPUT_PATH):
	    try:
	        with open(OUTPUT_PATH, "r", encoding="utf-8") as saved_file:
	            saved_state = json.load(saved_file)
	        final_output = saved_state.get("results", [])
	        start_idx = len(final_output)
	        print(f"Resuming from sample index {start_idx}")
	    except Exception as err:
	        # An unreadable file is reported and otherwise ignored.
	        print(f"Error reading existing output file: {err}")

	def extract_think(output_str):
	    """Return the text inside the first <think>...</think> pair.

	    Surrounding whitespace is stripped and the content may span several
	    lines. Returns an empty string when no complete pair is present.

	    The previous pattern, ``<think>\\s(.?)\\s*</think>``, required exactly one
	    whitespace character after the opening tag and captured at most one
	    character, so real reasoning chains never matched.
	    """
	    match = re.search(r'<think>\s*(.*?)\s*</think>', output_str, re.DOTALL)
	    if match is None:
	        return ""
	    return match.group(1).strip()

	def extract_answer(text):
	    """Return the text inside the first <answer>...</answer> pair.

	    Surrounding whitespace is stripped and the content may span several
	    lines. Returns an empty string when no complete pair is present.

	    The previous pattern, ``<answer>\\s(.?)\\s*</answer>``, required exactly
	    one whitespace character after the opening tag and captured at most one
	    character, so answers such as "42" or free-form text were never
	    extracted.
	    """
	    match = re.search(r'<answer>\s*(.*?)\s*</answer>', text, re.DOTALL)
	    if match is None:
	        return ""
	    return match.group(1).strip()

	def normalize_number(num_str):
	    """Convert a numeric string to a float, ignoring commas.

	    Every comma is removed before conversion. Returns None, after printing a
	    message, when the input cannot be converted.
	    """
	    try:
	        num_str = num_str.replace(',', '')
	        value = float(num_str)
	    except Exception as err:
	        print(f"Error converting '{num_str}' to float: {err}")
	        value = None
	    return value

	def wer(reference, hypothesis):
	    """Return the word error rate of *hypothesis* against *reference*.

	    Both strings are split on whitespace. The result is the word-level edit
	    distance (insertions, deletions and substitutions each cost 1) divided by
	    the number of reference words, with the divisor floored at 1.
	    """
	    ref_tokens = reference.split()
	    hyp_tokens = hypothesis.split()

	    # prev[col] is the distance between the reference prefix processed so far
	    # and the first `col` hypothesis words; it starts with the empty prefix.
	    prev = list(range(len(hyp_tokens) + 1))
	    for row, ref_tok in enumerate(ref_tokens, start=1):
	        cur = [row]
	        for col, hyp_tok in enumerate(hyp_tokens, start=1):
	            if ref_tok == hyp_tok:
	                cur.append(prev[col - 1])
	            else:
	                cur.append(1 + min(prev[col], cur[col - 1], prev[col - 1]))
	        prev = cur

	    return prev[-1] / max(1, len(ref_tokens))

	def compute_bleu_score(reference, hypothesis):
	    """Return the sentence-level BLEU score of *hypothesis* against *reference*.

	    Both strings are split on whitespace and scored with NLTK's
	    ``sentence_bleu`` using ``SmoothingFunction().method1``. Any exception is
	    printed and yields 0.0.
	    """
	    try:
	        smoother = SmoothingFunction().method1
	        return sentence_bleu(
	            [reference.split()],
	            hypothesis.split(),
	            smoothing_function=smoother,
	        )
	    except Exception as err:
	        print(f"Error computing BLEU score: {err}")
	        return 0.0

	def compute_rouge_score(reference, hypothesis, use_stemmer=True):
	    """Return the mean F-measure of ROUGE-1, ROUGE-2 and ROUGE-L.

	    *use_stemmer* is forwarded to ``rouge_scorer.RougeScorer``.
	    """
	    metric_names = ['rouge1', 'rouge2', 'rougeL']
	    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=use_stemmer)
	    per_metric = scorer.score(reference, hypothesis)
	    return sum(per_metric[name].fmeasure for name in metric_names) / 3

	def reward_fn(sample, model_output, question_type):
	    """Score *model_output* against the sample's ground-truth solution.

	    The answers are taken from the <answer> tags of *model_output* and of
	    ``sample["solution"]`` (empty string when the key is absent). Scoring
	    depends on *question_type*:

	    - "multiple choice": 1.0 on exact match after stripping, else 0.0.
	    - "numerical": 0.0 unless both answers agree on containing "." or ",";
	      otherwise 1.0 when the parsed values are equal after rounding to two
	      decimals, else 0.0.
	    - "OCR": 1 minus the word error rate, clamped to [0, 1].
	    - "free-form": the averaged ROUGE score, clamped to [0, 1].
	    - "regression": 1 minus the relative difference, with the difference
	      clamped to [0, 1].
	    - anything else: 0.0.

	    Unparseable numbers score 0.0. Any exception is printed and scores 0.0.
	    """
	    try:
	        predicted = extract_answer(model_output)
	        expected = extract_answer(sample.get("solution", ""))

	        if question_type == "multiple choice":
	            return 1.0 if predicted.strip() == expected.strip() else 0.0

	        if question_type == "numerical":
	            expected_has_sep = ("." in expected) or ("," in expected)
	            predicted_has_sep = ("." in predicted) or ("," in predicted)
	            if expected_has_sep != predicted_has_sep:
	                return 0.0
	            expected_num = normalize_number(expected)
	            predicted_num = normalize_number(predicted)
	            if expected_num is None or predicted_num is None:
	                return 0.0
	            return 1.0 if round(expected_num, 2) == round(predicted_num, 2) else 0.0

	        if question_type == "OCR":
	            accuracy = 1 - wer(expected, predicted)
	            return max(0.0, min(1.0, accuracy))

	        if question_type == "free-form":
	            rouge = compute_rouge_score(expected, predicted)
	            return max(0.0, min(1.0, rouge))

	        if question_type == "regression":
	            expected_num = normalize_number(expected)
	            predicted_num = normalize_number(predicted)
	            if expected_num is None or predicted_num is None:
	                return 0.0
	            # The small epsilon keeps the ratio defined when the target is 0.
	            rel_diff = (abs(predicted_num - expected_num) + 1e-9) / (abs(expected_num) + 1e-9)
	            rel_diff = min(1.0, max(0.0, rel_diff))
	            return 1 - rel_diff

	        return 0.0
	    except Exception as err:
	        print(f"Error in reward_fn for question_type '{question_type}': {err}")
	        return 0.0


	# Generate, score and checkpoint the remaining samples one batch at a time.
	for i in tqdm(range(start_idx, len(messages), BSZ), desc="Processing batches"):
	    batch_messages = messages[i:i + BSZ]

	    prompts = [
	        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
	        for msg in batch_messages
	    ]

	    try:
	        image_inputs, video_inputs, video_kwargs = process_vision_info(batch_messages, return_video_kwargs=True)

	        # image_inputs / video_inputs are consumed in order: each conversation
	        # takes the next entry of the list matching its media type.
	        image_idx = 0
	        video_idx = 0
	        llm_inputs = []

	        for msg, prompt in zip(batch_messages, prompts):
	            mm_type = msg[0]['content'][0]['type']
	            sample_mm_data = {}
	            sample_video_kw = {}
	            if mm_type == 'image':
	                sample_mm_data["image"] = image_inputs[image_idx]
	                image_idx += 1
	            elif mm_type == 'video':
	                sample_mm_data["video"] = video_inputs[video_idx]
	                # Pick this video's entry out of every per-video kwarg list.
	                for key, value in video_kwargs.items():
	                    sample_video_kw[key] = value[video_idx]
	                video_idx += 1

	            llm_inputs.append({
	                "prompt": prompt,
	                "multi_modal_data": sample_mm_data,
	                "mm_processor_kwargs": sample_video_kw,
	            })

	        outputs = llm.generate(llm_inputs, sampling_params=sampling_params)
	        batch_output_text = [out.outputs[0].text for out in outputs]

	    except Exception as e:
	        # Any failure discards the whole batch. Report the first sample of the
	        # batch and the exception itself (previously dropped), then substitute
	        # one placeholder output per sample actually in the batch.
	        print('error:', data[i]['path'])
	        print(f"Batch starting at sample index {i} failed: {e!r}")
	        batch_output_text = ['<answer>error</answer>'] * len(batch_messages)

	    for sample, model_output in zip(data[i:i + BSZ], batch_output_text):
	        think_chain = extract_think(model_output)
	        final_ans = extract_answer(model_output)
	        sample["answer"] = final_ans
	        q_type = sample.get("problem_type", "")
	        sample["reward"] = reward_fn(sample, model_output, q_type)
	        sample['select'] = sample["reward"] > 0.6
	        if think_chain:
	            sample["process"] = f"<think>{think_chain}</think>"
	        # NOTE(review): every sample is appended here, including those without
	        # a reasoning chain. Resuming uses len(final_output) as the next sample
	        # index, which needs one entry per sample — confirm against the
	        # original indentation.
	        final_output.append(sample)

	    try:
	        # Write to a sibling temp file and swap it in, so an interruption
	        # during the dump cannot leave a truncated file for the next resume.
	        tmp_path = OUTPUT_PATH + ".tmp"
	        with open(tmp_path, "w", encoding="utf-8") as f:
	            json.dump({"results": final_output}, f, indent=2, ensure_ascii=False)
	        os.replace(tmp_path, OUTPUT_PATH)
	        print(f"Processed batch {(i - start_idx)//BSZ + 1}, saved {len(final_output)} samples.")
	    except Exception as e:
	        print(f"Error writing to output file: {e}")

	print(f"Results saved to {OUTPUT_PATH}")