| import os |
| import json |
| import re |
| from tqdm import tqdm |
| from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction |
| from rouge_score import rouge_scorer |
| import torch |
|
|
| from transformers import AutoProcessor, AutoTokenizer |
| from vllm import LLM, SamplingParams |
| from qwen_vl_utils import process_vision_info |
|
|
|
|
# ---------------------------------------------------------------------------
# Model, sampling and chat-template setup (runs once at start-up).
# ---------------------------------------------------------------------------

# Checkpoint shared by the vLLM engine, the processor and the tokenizer.
MODEL_PATH = "Qwen/Qwen2.5-VL-72B-Instruct"
# Number of samples sliced off per generation call.
BSZ = 32

# Engine options are gathered in one place before the engine is built.
_engine_options = dict(
    model=MODEL_PATH,
    # One tensor-parallel rank per CUDA device reported by torch.
    tensor_parallel_size=torch.cuda.device_count(),
    max_model_len=8192,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"image": 10, "video": 10},
)
llm = LLM(**_engine_options)

# Sampling configuration used for every generation request.
_sampling_options = dict(
    temperature=1.0,
    top_p=0.95,
    max_tokens=512,
    stop_token_ids=[],
)
sampling_params = SamplingParams(**_sampling_options)

# The processor renders chat templates; it is given a tokenizer whose
# padding side has been switched to the left.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = "left"
processor = AutoProcessor.from_pretrained(MODEL_PATH)
processor.tokenizer = tokenizer
|
|
# NOTE(review): this copy of the script carries no indentation, so the extent
# of the loop body below is reconstructed (loading, prompt construction and
# resume state). Confirm whether the later generation pass was meant to be
# nested per dataset as well.
for dataset_name in ['your_data_name']:

    # Where the prompts are read from and where generated chains are written.
    OUTPUT_PATH = f"./src/r1-v/Video-R1-data/{dataset_name}_COT_qwen72b.json"
    PROMPT_PATH = f"./src/r1-v/Video-R1-data/{dataset_name}.json"

    # Load the samples: one JSON object per line for .jsonl, a single JSON
    # document for .json; any other extension is rejected.
    if PROMPT_PATH.endswith('.jsonl'):
        with open(PROMPT_PATH, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif PROMPT_PATH.endswith('.json'):
        with open(PROMPT_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        raise ValueError("Input file must be .json or .jsonl")

    # Instruction appended to every question; {Question} is filled per sample.
    QUESTION_TEMPLATE = (
        "{Question}\n"
        "Please think about this question as if you were a human pondering deeply. "
        "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
        "It's encouraged to include self-reflection or verification in the reasoning process. "
        "Provide your detailed reasoning between the <think> and </think> tags, and then give your final answer between the <answer> and </answer> tags."
    )

    # Answer-format hint keyed by the sample's "problem_type".
    TYPE_TEMPLATE = {
        "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
        "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
        "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
        "free-form": " Please provide your text answer within the <answer> </answer> tags.",
        "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
    }

    # Build one single-turn chat (media entry + text entry) per sample.
    messages = []
    for x in data:
        problem_type = x["problem_type"]
        question = x['problem']
        if problem_type == 'multiple choice':
            # Options are listed one per line directly after the problem text.
            question += "Options:\n" + "".join(op + "\n" for op in x["options"])

        media_kind = x['data_type']
        media_entry = {
            "type": media_kind,
            # The first character of the stored path is dropped before it is
            # appended to the data root under the current working directory.
            media_kind: os.getcwd() + "/src/r1-v/Video-R1-data" + x['path'][1:],
        }
        text_entry = {
            "type": "text",
            "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[problem_type],
        }
        messages.append([{"role": "user", "content": [media_entry, text_entry]}])

    # Resume support: results already saved are kept, and processing restarts
    # at the index equal to the number of saved results.
    final_output = []
    start_idx = 0
    if os.path.exists(OUTPUT_PATH):
        try:
            with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
                existing = json.load(f)
            final_output = existing.get("results", [])
            start_idx = len(final_output)
            print(f"Resuming from sample index {start_idx}")
        except Exception as e:
            print(f"Error reading existing output file: {e}")
|
|
def extract_think(output_str):
    """Return the text inside the first ``<think>...</think>`` pair.

    Surrounding whitespace is removed from the enclosed text. An empty
    string is returned when ``output_str`` contains no complete pair.
    """
    found = re.search(r'<think>\s*(.*?)\s*</think>', output_str, re.DOTALL)
    return found.group(1).strip() if found else ""
|
|
def extract_answer(text):
    """Return the text inside the first ``<answer>...</answer>`` pair.

    Surrounding whitespace is removed from the enclosed text. An empty
    string is returned when ``text`` contains no complete pair.
    """
    found = re.search(r'<answer>\s*(.*?)\s*</answer>', text, re.DOTALL)
    if found is None:
        return ""
    return found.group(1).strip()
|
|
def normalize_number(num_str):
    """Parse a numeric answer string into a finite ``float``.

    Commas are removed before parsing, so ``"1,234.5"`` becomes ``1234.5``.

    Args:
        num_str: Text expected to hold a single number.

    Returns:
        The parsed value, or ``None`` when ``num_str`` is not a string, is
        not parseable, or parses to NaN / infinity. Each failure is also
        reported on stdout.
    """
    import math  # local import keeps this helper self-contained

    try:
        num_str = num_str.replace(',', '')
        value = float(num_str)
        # float() accepts "nan" / "inf". A NaN slips through min()/max()
        # clamping in reward_fn and would be scored as a perfect regression
        # answer, so non-finite values are treated as unparseable.
        if not math.isfinite(value):
            raise ValueError("value is not a finite number")
        return value
    except (AttributeError, TypeError, ValueError) as e:
        print(f"Error converting '{num_str}' to float: {e}")
        return None
|
|
def wer(reference, hypothesis):
    """Word error rate of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace. The word-level edit distance
    (insertions, deletions and substitutions each cost 1) is divided by
    the number of reference words, with a minimum divisor of 1.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # previous_row[j] holds the distance between the reference prefix seen
    # so far and the first j hypothesis words.
    previous_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current_row = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                current_row.append(previous_row[j - 1])
            else:
                cheapest = min(previous_row[j], current_row[j - 1], previous_row[j - 1])
                current_row.append(1 + cheapest)
        previous_row = current_row

    return previous_row[-1] / max(1, len(ref_words))
|
|
def compute_bleu_score(reference, hypothesis):
    """Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    Both strings are split on whitespace and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``. Any exception
    is reported on stdout and turned into a score of ``0.0``.
    """
    try:
        method1 = SmoothingFunction().method1
        references = [reference.split()]
        candidate = hypothesis.split()
        return sentence_bleu(references, candidate, smoothing_function=method1)
    except Exception as e:
        print(f"Error computing BLEU score: {e}")
        return 0.0
|
|
def compute_rouge_score(reference, hypothesis, use_stemmer=True):
    """Mean F-measure of ROUGE-1, ROUGE-2 and ROUGE-L.

    A ``rouge_scorer.RougeScorer`` is built for the three metrics (with the
    requested ``use_stemmer`` setting), ``hypothesis`` is scored against
    ``reference``, and the three F-measures are averaged.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scores = rouge_scorer.RougeScorer(metrics, use_stemmer=use_stemmer).score(reference, hypothesis)
    return sum(scores[name].fmeasure for name in metrics) / 3
|
|
def reward_fn(sample, model_output, question_type):
    """Score ``model_output`` against the ground truth stored in ``sample``.

    The answer is read from the ``<answer>`` tags of ``model_output`` and of
    ``sample["solution"]`` (an empty string when the key is absent), then
    compared according to ``question_type``:

    * ``"multiple choice"``: 1.0 on exact string match, else 0.0.
    * ``"numerical"``: 1.0 when both values agree after rounding to two
      decimals and both use the same decimal/comma formatting, else 0.0.
    * ``"OCR"``: ``1 - wer(gt, output)`` clamped to [0, 1].
    * ``"free-form"``: averaged ROUGE F-measure clamped to [0, 1].
    * ``"regression"``: ``1 - relative_error`` with the relative error
      clamped to [0, 1].
    * anything else: 0.0.

    Args:
        sample: Mapping for one dataset entry; only ``"solution"`` is read.
        model_output: Raw generated text.
        question_type: One of the type names listed above.

    Returns:
        A float in [0, 1]. Any exception raised while scoring is reported on
        stdout and yields 0.0.
    """
    import math  # local import keeps this block self-contained

    try:
        output_ans = extract_answer(model_output)
        gt_ans = extract_answer(sample.get("solution", ""))

        if question_type == "multiple choice":
            return 1.0 if output_ans.strip() == gt_ans.strip() else 0.0

        if question_type == "numerical":
            # NOTE(review): a comma counts as a decimal marker here, although
            # normalize_number() removes commas before parsing, so "1,000"
            # vs "1000" scores 0.0. Confirm this strictness is intended.
            gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
            out_has_decimal = ("." in output_ans) or ("," in output_ans)
            if gt_has_decimal != out_has_decimal:
                return 0.0
            gt_number = normalize_number(gt_ans)
            out_number = normalize_number(output_ans)
            if gt_number is None or out_number is None:
                return 0.0
            return 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0

        if question_type == "OCR":
            error_rate = wer(gt_ans, output_ans)
            return max(0.0, min(1.0, 1 - error_rate))

        if question_type == "free-form":
            score = compute_rouge_score(gt_ans, output_ans)
            return max(0.0, min(1.0, score))

        if question_type == "regression":
            gt_number = normalize_number(gt_ans)
            out_number = normalize_number(output_ans)
            if gt_number is None or out_number is None:
                return 0.0
            # A NaN would survive min()/max() clamping as 0.0 error and be
            # rewarded as a perfect answer, so non-finite values score 0.
            if not (math.isfinite(gt_number) and math.isfinite(out_number)):
                return 0.0
            # The epsilon only guards the division. It must not be added to
            # the numerator, otherwise gt == out == 0 yields an error of 1
            # (reward 0) for an exact match.
            rel_diff = abs(out_number - gt_number) / (abs(gt_number) + 1e-9)
            return 1 - min(1.0, rel_diff)

        return 0.0
    except Exception as e:
        print(f"Error in reward_fn for question_type '{question_type}': {e}")
        return 0.0
|
|
|
|
# Generation pass: samples are processed in slices of BSZ, scored, and the
# accumulated results are checkpointed after every slice.
# NOTE(review): this pass reads OUTPUT_PATH, data, messages, final_output and
# start_idx as prepared by the dataset loop; if several datasets are listed,
# confirm it is executed once per dataset.
for i in tqdm(range(start_idx, len(messages), BSZ), desc="Processing batches"):
    batch_messages = messages[i:i + BSZ]

    # Render each chat into the text prompt handed to the engine.
    prompts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in batch_messages
    ]

    try:
        image_inputs, video_inputs, video_kwargs = process_vision_info(
            batch_messages, return_video_kwargs=True
        )

        # image_inputs / video_inputs are indexed in the order the media
        # entries appear in the batch, so one running cursor per modality is
        # kept while walking the prompts.
        image_idx = 0
        video_idx = 0
        llm_inputs = []
        for msg, prompt in zip(batch_messages, prompts):
            mm_type = msg[0]['content'][0]['type']
            sample_mm_data = {}
            sample_video_kw = {}
            if mm_type == 'image':
                sample_mm_data["image"] = image_inputs[image_idx]
                image_idx += 1
            elif mm_type == 'video':
                sample_mm_data["video"] = video_inputs[video_idx]
                # Pick this video's entry out of every per-video kwarg list.
                for key, value in video_kwargs.items():
                    sample_video_kw[key] = value[video_idx]
                video_idx += 1

            llm_inputs.append({
                "prompt": prompt,
                "multi_modal_data": sample_mm_data,
                "mm_processor_kwargs": sample_video_kw,
            })

        outputs = llm.generate(llm_inputs, sampling_params=sampling_params)
        batch_output_text = [out.outputs[0].text for out in outputs]

    except Exception as e:
        # A failure anywhere in the slice marks every sample in it with a
        # placeholder answer. The cause is reported too, so that failures
        # can be diagnosed rather than silently swallowed.
        print('error:', data[i]['path'], '-', repr(e))
        batch_output_text = ['<answer>error</answer>'] * len(batch_messages)

    # Attach the extracted answer, reasoning chain and reward to each sample.
    for sample, model_output in zip(data[i:i + BSZ], batch_output_text):
        think_chain = extract_think(model_output)
        sample["answer"] = extract_answer(model_output)
        q_type = sample.get("problem_type", "")
        sample["reward"] = reward_fn(sample, model_output, q_type)
        sample['select'] = sample["reward"] > 0.6
        if think_chain:
            sample["process"] = f"<think>{think_chain}</think>"
        final_output.append(sample)

    try:
        # Write to a sibling file and rename it over the target, so that an
        # interruption mid-write cannot leave a truncated checkpoint (which
        # the resume logic would fail to parse and then restart from zero).
        tmp_path = OUTPUT_PATH + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"results": final_output}, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, OUTPUT_PATH)
        print(f"Processed batch {(i - start_idx)//BSZ + 1}, saved {len(final_output)} samples.")
    except Exception as e:
        print(f"Error writing to output file: {e}")


print(f"Results saved to {OUTPUT_PATH}")
|
|