# src/engine/dpo_trainer.py
import torch
import torch.nn.functional as F
from tqdm import tqdm
import json
import random
from src.utils.text_utils import get_target_answer, normalize_answer
def _is_closed_question(question: str, answer: str) -> bool:
    """Heuristic check for closed (yes/no, "có"/"không") questions."""
q = normalize_answer(question)
a = normalize_answer(answer)
return (
a in {"có", "không"}
or q.endswith(" không")
or " bình thường " in f" {q} "
or " có " in f" {q} "
)
def _flip_closed_answer(answer: str) -> str:
a = normalize_answer(answer)
if a == "có":
return "không"
if a == "không":
return "có"
return a
def _answer_category(question: str, answer: str) -> str:
    """Bucket a QA pair into one of: closed, location, plane, organ, finding."""
q = normalize_answer(question)
a = normalize_answer(answer)
if _is_closed_question(question, answer):
return "closed"
if any(term in q for term in ["ở đâu", "vi tri", "where"]):
return "location"
if any(term in a for term in ["trái", "phải", "trên", "dưới", "giữa", "bên"]):
return "location"
if any(term in a for term in ["mặt phẳng", "ngang", "vành", "dọc"]):
return "plane"
if any(term in a for term in ["gan", "phổi", "tim", "não", "thận", "lách", "bàng quang", "khí quản", "trung thất"]):
return "organ"
return "finding"
def _build_answer_pools(data: list[dict], max_words: int) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """Collect the distinct normalized answers seen per question and per answer category."""
question_to_answers = {}
category_to_answers = {}
for item in data:
question = item.get("question_vi", item.get("question", ""))
answer = get_target_answer(item, max_words=max_words)
if not question or not answer:
continue
q_norm = normalize_answer(question)
a_norm = normalize_answer(answer)
category = _answer_category(question, answer)
question_to_answers.setdefault(q_norm, [])
if a_norm not in question_to_answers[q_norm]:
question_to_answers[q_norm].append(a_norm)
category_to_answers.setdefault(category, [])
if a_norm not in category_to_answers[category]:
category_to_answers[category].append(a_norm)
return question_to_answers, category_to_answers
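
# Illustration of the pool shapes built above, using made-up (hypothetical) entries:
#   question_to_answers = {"khối u nằm ở đâu": ["gan", "phổi"]}
#   category_to_answers = {"organ": ["gan", "phổi"], "closed": ["có", "không"]}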
def _build_rejected_candidates(
data: list[dict],
idx: int,
chosen: str,
question_to_answers: dict[str, list[str]],
category_to_answers: dict[str, list[str]],
) -> list[str]:
    """Propose plausible-but-wrong answers: flip closed answers, otherwise draw from the question and category pools."""
item = data[idx]
question = item.get("question_vi", item.get("question", ""))
question_norm = normalize_answer(question)
chosen_norm = normalize_answer(chosen)
category = _answer_category(question, chosen)
candidates = []
if _is_closed_question(question, chosen):
flipped = _flip_closed_answer(chosen)
if flipped and flipped != chosen_norm:
candidates.append(flipped)
else:
for answer in question_to_answers.get(question_norm, []):
if answer != chosen_norm:
candidates.append(answer)
for answer in category_to_answers.get(category, []):
if answer != chosen_norm:
candidates.append(answer)
deduped = []
seen = set()
for candidate in candidates:
candidate_norm = normalize_answer(candidate)
if not candidate_norm or candidate_norm == chosen_norm or candidate_norm in seen:
continue
seen.add(candidate_norm)
deduped.append(candidate_norm)
return deduped
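
# Candidate logic in short, using the toy pools above: for a closed question with
# chosen answer "có" the only candidate is the flipped "không"; for an open
# question with chosen answer "gan", candidates come from the same question's
# other answers first, then from the matching category pool (here "organ").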
def _build_pair_record(item: dict, source_idx: int, chosen: str, rejected: str) -> dict:
    question = item.get("question_vi") or item.get("question", "")
    return {
        "image": item.get("image_name") or item.get("image"),
        "source_idx": source_idx,
        "question": question,
        "chosen": chosen,
        "rejected": rejected,
        "answer_type": _answer_category(question, chosen),
    }
def _round_robin_merge(grouped_pairs: dict[str, list[dict]], target_count: int) -> list[dict]:
    """Interleave pairs from each group, in sorted-key order, until target_count is reached."""
ordered_groups = sorted(grouped_pairs.keys())
merged = []
while len(merged) < target_count:
progressed = False
for group in ordered_groups:
if grouped_pairs[group]:
merged.append(grouped_pairs[group].pop())
progressed = True
if len(merged) >= target_count:
break
if not progressed:
break
return merged
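
# Worked example (hypothetical groups): with grouped_pairs = {"a": [1, 2], "b": [3]}
# and target_count=3, each round pops one pair per group in sorted-key order:
# round 1 -> [2, 3], round 2 -> [2, 3, 1], so no single open-question category
# dominates the sample.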
def create_preference_data(
vqa_json_path,
output_path,
num_pairs=400,
closed_ratio=0.6,
max_answer_words=6,
seed=42,
):
"""
Tạo dữ liệu Preference (Chosen vs Rejected) cho DPO.
Trong Medical VQA, 'Rejected' thường là các câu trả lời bị hallucination hoặc sai thuật ngữ y khoa.
"""
with open(vqa_json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
question_to_answers, category_to_answers = _build_answer_pools(data, max_words=max_answer_words)
rng = random.Random(seed)
closed_pairs = []
open_pairs_by_group = {"location": [], "plane": [], "organ": [], "finding": []}
    for i, item in enumerate(data):
chosen = get_target_answer(item, max_words=max_answer_words)
chosen_norm = normalize_answer(chosen)
if not chosen_norm or len(chosen_norm.split()) > max_answer_words:
continue
rejected_candidates = _build_rejected_candidates(
data,
i,
chosen_norm,
question_to_answers=question_to_answers,
category_to_answers=category_to_answers,
)
category = _answer_category(item["question_vi"], chosen_norm)
for rejected in rejected_candidates:
if len(rejected.split()) > max_answer_words:
continue
pair = _build_pair_record(item, i, chosen_norm, rejected)
if category == "closed":
closed_pairs.append(pair)
elif category in open_pairs_by_group:
open_pairs_by_group[category].append(pair)
rng.shuffle(closed_pairs)
for pairs in open_pairs_by_group.values():
rng.shuffle(pairs)
target_closed = min(len(closed_pairs), int(round(num_pairs * closed_ratio)))
target_open = max(0, num_pairs - target_closed)
sampled_closed = closed_pairs[:target_closed]
sampled_open = _round_robin_merge(open_pairs_by_group, target_open)
pref_data = sampled_closed + sampled_open
rng.shuffle(pref_data)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(pref_data, f, ensure_ascii=False, indent=2)
    print(
        f"[SUCCESS] Created {len(pref_data)} preference pairs at {output_path} "
        f"(closed={len(sampled_closed)}, open={len(sampled_open)})"
    )
preview_count = min(30, len(pref_data))
if preview_count:
print(f"[INFO] Preview {preview_count} cặp preference đầu tiên để kiểm tra nhanh:")
for idx, pair in enumerate(pref_data[:preview_count], start=1):
print(
f" [{idx:02d}] type={pair.get('answer_type')} | "
f"Q={pair.get('question')} | chosen={pair.get('chosen')} | rejected={pair.get('rejected')}"
)
return pref_data
class MedicalDPOTrainer:
"""
Trainer cho Direct Preference Optimization (DPO) trên LLaVA-Med.
Giúp tối ưu hóa mô hình dựa trên các cặp preference dữ liệu y tế.
"""
def __init__(self, model, reference_model, train_loader, optimizer, device, config):
self.model = model
self.reference_model = reference_model
self.train_loader = train_loader
self.optimizer = optimizer
self.device = device
self.config = config
self.beta = config.get('dpo_beta', 0.1)
    def get_log_probs(self, logits, labels):
        """
        Compute per-sequence log probabilities.
        logits: [batch, seq_len, vocab]
        labels: [batch, seq_len]
        """
        # Shift logits and labels so position t predicts token t+1 (next-token prediction)
        logits = logits[:, :-1, :]
        labels = labels[:, 1:]
        log_probs = F.log_softmax(logits, dim=-1)
        # Gather the log prob of each target token
        per_token_logps = torch.gather(log_probs, dim=2, index=labels.unsqueeze(2)).squeeze(2)
        # Sum over non-padding tokens only (assumes pad token id == 0)
        return (per_token_logps * (labels != 0)).sum(-1)
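
    # Shape walk-through for get_log_probs (assumed sizes): with logits
    # [2, 10, 32000] and labels [2, 10], the shift yields [2, 9, 32000] and
    # [2, 9]; gather picks one log prob per target token -> [2, 9]; masking
    # and summing reduce to one scalar per sequence -> [2].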
    def compute_loss(self, policy_chosen_logps, policy_rejected_logps,
                     reference_chosen_logps, reference_rejected_logps):
        """
        Compute the DPO loss: -log(sigmoid(beta * (log_ratio_chosen - log_ratio_rejected)))
        """
        pi_logratios = policy_chosen_logps - policy_rejected_logps
        ref_logratios = reference_chosen_logps - reference_rejected_logps
        logits = pi_logratios - ref_logratios
        loss = -F.logsigmoid(self.beta * logits).mean()
        # Implicit rewards, tracked for monitoring only
        chosen_rewards = self.beta * (policy_chosen_logps - reference_chosen_logps).detach()
        rejected_rewards = self.beta * (policy_rejected_logps - reference_rejected_logps).detach()
        return loss, chosen_rewards, rejected_rewards
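
    # Numeric sanity check (made-up values): if the policy log-ratio is 2.0 and
    # the reference log-ratio is 0.5, then logits = 1.5 and, with beta = 0.1,
    # loss = -log(sigmoid(0.15)) ~= 0.621. Positive logits (the policy prefers
    # the chosen answer more strongly than the reference) push the loss toward 0.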
def train(self, epochs=3):
print(f"[INFO] Bắt đầu huấn luyện DPO (beta={self.beta})...")
self.model.train()
self.reference_model.eval()
        # Freeze the reference model to save VRAM (important on a T4)
for param in self.reference_model.parameters():
param.requires_grad_(False)
print(f"[INFO] DPO Trainer Ready ({self.device})")
for epoch in range(epochs):
self.model.train()
            total_loss = 0.0
pbar = tqdm(self.train_loader, desc=f"DPO Epoch {epoch+1}")
for batch in pbar:
images = batch['image'].to(self.device)
chosen_ids = batch['chosen_ids'].to(self.device)
rejected_ids = batch['rejected_ids'].to(self.device)
            # 1. Forward the policy model on chosen and rejected (duck-typed safe forward)
            try:
                # Case: LLaVA-style multimodal model
                outputs_w = self.model(input_ids=chosen_ids, pixel_values=images, labels=chosen_ids)
                outputs_l = self.model(input_ids=rejected_ids, pixel_values=images, labels=rejected_ids)
                logits_w = outputs_w.logits
                logits_l = outputs_l.logits
            except Exception:
                # Fallback: modular model (A1/A2 style)
                _, logits_w = self.model(images, chosen_ids)
                _, logits_l = self.model(images, rejected_ids)
            # 2. Forward the frozen reference model (no grad)
            with torch.no_grad():
                try:
                    # Multimodal case
                    outputs_ref_w = self.reference_model(input_ids=chosen_ids, pixel_values=images, labels=chosen_ids)
                    outputs_ref_l = self.reference_model(input_ids=rejected_ids, pixel_values=images, labels=rejected_ids)
                    ref_logits_w = outputs_ref_w.logits
                    ref_logits_l = outputs_ref_l.logits
                except Exception:
                    # Modular case
                    _, ref_logits_w = self.reference_model(images, chosen_ids)
                    _, ref_logits_l = self.reference_model(images, rejected_ids)
            # 3. Per-sequence log probs
logps_w = self.get_log_probs(logits_w, chosen_ids)
logps_l = self.get_log_probs(logits_l, rejected_ids)
ref_logps_w = self.get_log_probs(ref_logits_w, chosen_ids)
ref_logps_l = self.get_log_probs(ref_logits_l, rejected_ids)
            # 4. DPO loss
loss, _, _ = self.compute_loss(logps_w, logps_l, ref_logps_w, ref_logps_l)
            # 5. Backward pass and optimizer step
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
total_loss += loss.item()
pbar.set_postfix({"loss": loss.item()})
print(f"Epoch {epoch+1} | DPO Loss: {total_loss/len(self.train_loader):.4f}")