# Medical-VQA / train_medical.py
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
import yaml
import argparse
import os
import random
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# [Bypass CVE-2025-32434] Skip transformers' forced "upgrade to PyTorch 2.6" check for torch.load
import transformers.utils.import_utils
transformers.utils.import_utils.check_torch_load_is_safe = lambda: None
import transformers.modeling_utils
transformers.modeling_utils.check_torch_load_is_safe = lambda: None
# [Bypass FSDPModule Error] Work around trl importing FSDPModule on older PyTorch versions
import torch.distributed.fsdp as fsdp
if not hasattr(fsdp, "FSDPModule"):
fsdp.FSDPModule = fsdp.FullyShardedDataParallel
import csv
import json
from datetime import datetime
from pathlib import Path
from PIL import Image
from datasets import load_dataset
# Import components from the src package
from src.models.medical_vqa_model import MedicalVQAModelA
from src.models.multimodal_vqa import MultimodalVQA
from src.utils.visualization import MedicalImageTransform as MedicalTransform
from src.data.medical_dataset import MedicalVQADataset
from src.utils.text_utils import get_target_answer, normalize_answer, postprocess_answer
def build_training_arguments(training_arguments_cls, **kwargs):
"""Create TrainingArguments across transformers versions."""
if "evaluation_strategy" in kwargs and "eval_strategy" not in kwargs:
alias_kwargs = dict(kwargs)
alias_kwargs["eval_strategy"] = alias_kwargs.pop("evaluation_strategy")
try:
return training_arguments_cls(**alias_kwargs)
except TypeError as exc:
if "eval_strategy" not in str(exc):
raise
return training_arguments_cls(**kwargs)
def vqa_collate_fn(batch):
"""Hàm gom batch tùy chỉnh để xử lý ảnh PIL và raw text."""
elem = batch[0]
collated = {}
for key in elem.keys():
if key in ['image', 'input_ids', 'attention_mask', 'label_closed', 'target_ids', 'chosen_ids', 'rejected_ids']:
collated[key] = torch.stack([item[key] for item in batch])
else:
            # Keep PIL images and raw text as plain lists
collated[key] = [item[key] for item in batch]
return collated
def flatten_dict(data, parent_key="", sep="."):
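    """Flatten a nested dict into dotted keys, dropping list/tuple values.

    Example: {"train": {"lr": 1e-4}} -> {"train.lr": 1e-4}. Lists are skipped
    because they do not map onto a flat CSV column.
    """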
items = {}
for key, value in data.items():
new_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
if isinstance(value, dict):
items.update(flatten_dict(value, new_key, sep=sep))
elif isinstance(value, (list, tuple)):
continue
else:
items[new_key] = value
return items
def create_history_dir(base_log_dir, variant):
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
history_dir = os.path.join(base_log_dir, "history", variant, timestamp)
os.makedirs(history_dir, exist_ok=True)
return history_dir
def save_history_records(history_dir, records):
os.makedirs(history_dir, exist_ok=True)
json_path = os.path.join(history_dir, "history.json")
csv_path = os.path.join(history_dir, "history.csv")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(records, f, ensure_ascii=False, indent=2)
flat_rows = [flatten_dict(record) for record in records]
if flat_rows:
fieldnames = sorted({key for row in flat_rows for key in row.keys()})
with open(csv_path, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(flat_rows)
def select_best_adapter_checkpoint(checkpoint_root: str):
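    """Pick the best LoRA adapter checkpoint under checkpoint_root.

    Prefers trainer_state.json's best_model_checkpoint when it points at a
    valid adapter (config + non-empty safetensors weights); otherwise falls
    back to the lexicographically last valid checkpoint-* directory (note:
    glob order is not numeric, so checkpoint-99 sorts after checkpoint-1000).
    """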
checkpoint_root = Path(checkpoint_root)
if not checkpoint_root.exists():
raise FileNotFoundError(f"Không tìm thấy thư mục checkpoint: {checkpoint_root}")
def _is_valid_adapter_checkpoint(path: Path) -> bool:
adapter_cfg = path / "adapter_config.json"
adapter_weights = path / "adapter_model.safetensors"
if not adapter_cfg.exists() or not adapter_weights.exists():
return False
try:
from safetensors import safe_open
with safe_open(str(adapter_weights), framework="pt", device="cpu") as f:
return len(f.keys()) > 0
except Exception as exc:
print(f"[WARN] Bỏ qua checkpoint lỗi {path}: {exc}")
return False
checkpoint_dirs = sorted(
p for p in checkpoint_root.glob("checkpoint-*")
if _is_valid_adapter_checkpoint(p)
)
if not checkpoint_dirs:
raise FileNotFoundError(f"Không có adapter checkpoint hợp lệ trong {checkpoint_root}")
for state_file in sorted(checkpoint_root.glob("checkpoint-*/trainer_state.json"), reverse=True):
try:
state = json.loads(state_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
best_path = state.get("best_model_checkpoint")
if best_path:
best_dir = Path(best_path.replace("./", ""))
if not best_dir.is_absolute():
best_dir = Path.cwd() / best_dir
if _is_valid_adapter_checkpoint(best_dir):
return best_dir.resolve()
return checkpoint_dirs[-1].resolve()
def build_dpo_instruction_prompt(question: str, max_words: int = 10) -> str:
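    """Build the LLaVA-style USER/ASSISTANT prompt used for DPO/PPO rollouts.

    The instruction block is deliberately unaccented Vietnamese ("answer in
    Vietnamese, no English, do not repeat the question, at most {max_words}
    words") and must stay Vietnamese, since the model is trained to answer
    in Vietnamese.
    """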
question = str(question or "").strip()
instruction = (
"Chi tra loi bang tieng Viet. "
"Khong dung tieng Anh. "
"Khong lap lai cau hoi. "
"Khong mo ta hinh anh chung chung. "
f"Chi tra loi truc tiep dap an, toi da {max_words} tu."
)
return f"USER: <image>\n{question}\n{instruction} ASSISTANT:"
def load_latest_variant_metrics(history_root: str, variant: str) -> dict | None:
variant_dir = Path(history_root) / variant
if not variant_dir.exists():
return None
history_files = sorted(variant_dir.glob("*/history.json"))
if not history_files:
return None
for history_file in reversed(history_files):
try:
records = json.loads(history_file.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
continue
if records:
return records[-1]
return None
def evaluate_dpo_acceptance(b2_metrics: dict | None, dpo_metrics: dict) -> dict:
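    """Decide whether a refined model (DPO/PPO) is accepted over the B2 baseline.

    Acceptance requires (a) no drop worse than 1.0 pp on normalized accuracy,
    F1, or BLEU-4, and (b) either closed-question accuracy not regressing, or
    open-question semantic score not regressing with the BERTScore drop kept
    within 0.3 pp.
    """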
if not b2_metrics:
return {
"status": "unknown",
"reason": "missing_b2_metrics",
"summary": "Khong tim thay metrics B2 de doi chieu.",
}
def pct_delta(key: str) -> float | None:
b2_val = b2_metrics.get(key)
dpo_val = dpo_metrics.get(key)
if b2_val is None or dpo_val is None:
return None
return (dpo_val - b2_val) * 100.0
deltas = {
"accuracy": pct_delta("val_accuracy_normalized"),
"f1": pct_delta("val_f1_normalized"),
"bleu4": pct_delta("val_bleu4_normalized"),
"closed_acc": pct_delta("val_closed_accuracy"),
"open_semantic": pct_delta("val_open_semantic"),
"open_bert": pct_delta("val_open_bertscore"),
}
failed_drop = any(
delta is not None and delta < -1.0
for delta in (deltas["accuracy"], deltas["f1"], deltas["bleu4"])
)
closed_ok = (
b2_metrics.get("val_closed_accuracy") is not None
and dpo_metrics.get("val_closed_accuracy") is not None
and dpo_metrics["val_closed_accuracy"] >= b2_metrics["val_closed_accuracy"]
)
open_ok = (
b2_metrics.get("val_open_semantic") is not None
and dpo_metrics.get("val_open_semantic") is not None
and b2_metrics.get("val_open_bertscore") is not None
and dpo_metrics.get("val_open_bertscore") is not None
and dpo_metrics["val_open_semantic"] >= b2_metrics["val_open_semantic"]
and (dpo_metrics["val_open_bertscore"] - b2_metrics["val_open_bertscore"]) * 100.0 >= -0.3
)
accepted = (not failed_drop) and (closed_ok or open_ok)
def _fmt(delta: float | None) -> str:
return "N/A" if delta is None else f"{delta:.2f}"
summary = (
f"DPO vs B2 deltas (pp): Acc={_fmt(deltas['accuracy'])} | F1={_fmt(deltas['f1'])} | "
f"BLEU={_fmt(deltas['bleu4'])} | Closed={_fmt(deltas['closed_acc'])} | "
f"OpenSem={_fmt(deltas['open_semantic'])} | OpenBERT={_fmt(deltas['open_bert'])}"
)
return {
"status": "accepted" if accepted else "failed",
"reason": "criteria_met" if accepted else "metric_drop_or_no_gain",
"summary": summary,
"deltas_pp": deltas,
"closed_ok": closed_ok,
"open_ok": open_ok,
}
def evaluate_refinement_acceptance(base_metrics: dict | None, rl_metrics: dict) -> dict:
return evaluate_dpo_acceptance(base_metrics, rl_metrics)
def sanitize_dpo_completion(question: str, answer: str, max_words: int = 10) -> str:
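    """Normalize a generated answer for preference/rollout data.

    Maps yes/no variants onto the canonical Vietnamese tokens "có"/"không".
    Closed questions are detected from both accented and ASCII-folded
    patterns, so the check works whichever form normalize_answer produces.
    """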
question_norm = normalize_answer(question)
answer_norm = postprocess_answer(answer, max_words=max_words)
if answer_norm in {"yes", "có"}:
return "có"
if answer_norm in {"no", "không"}:
return "không"
is_closed = any(
pattern in question_norm
for pattern in ["bình thường", "bat thuong", "normal", "abnormal"]
) or question_norm.endswith(" không") or " có " in f" {question_norm} "
if is_closed:
if any(token in answer_norm for token in ["không", "no", "not normal", "abnormal"]):
return "không"
if any(token in answer_norm for token in ["có", "yes", "bình thường", "normal", "present", "detected"]):
return "có"
return answer_norm
def resolve_dpo_image(item: dict, hf_train_data=None, image_dir: str | None = None):
source_idx = item.get("source_idx")
if hf_train_data is not None and source_idx is not None and 0 <= int(source_idx) < len(hf_train_data):
img = hf_train_data[int(source_idx)].get("image")
if img is not None and getattr(img, "mode", None) != "RGB":
img = img.convert("RGB")
return img
image_name = item.get("image")
if image_name and image_dir:
img_path = os.path.join(image_dir, image_name)
if os.path.exists(img_path):
return Image.open(img_path).convert("RGB")
return None
def infer_closed_answer_type(item: dict, answer: str | None = None) -> bool:
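    """Heuristically classify an item as a closed (yes/no) question.

    Trusts an explicit answer_type/label_closed field first, then falls back
    to checking whether the normalized answer is a yes/no token.
    """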
answer_norm = normalize_answer(answer if answer is not None else get_target_answer(item))
answer_type = str(item.get("answer_type", "")).strip().upper()
label_closed = item.get("label_closed", None)
if answer_type == "CLOSED" or label_closed in (0, 1):
return True
return answer_norm in {"có", "không", "yes", "no"}
def move_model_batch_to_device(batch: dict, device: torch.device) -> dict:
moved = {}
for key, value in batch.items():
if hasattr(value, "to"):
moved[key] = value.to(device)
else:
moved[key] = value
return moved
def build_multimodal_completion_batch(processor, prompts, completions, images, max_length=None):
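    """Tokenize prompt+completion pairs and mark which tokens belong to the completion.

    The mask is derived by tokenizing the prompts alone and flagging every
    non-padding position past the prompt length, which works for both left-
    and right-padded tokenizers:
        prompt tokens -> mask 0, completion tokens -> mask 1, padding -> mask 0
    """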
full_texts = [f"{prompt}{completion}" for prompt, completion in zip(prompts, completions)]
batch = processor(
text=full_texts,
images=images,
return_tensors="pt",
padding=True,
truncation=False,
)
prompt_batch = processor(
text=prompts,
images=images,
return_tensors="pt",
padding=True,
truncation=False,
)
completion_mask = torch.zeros_like(batch["input_ids"], dtype=torch.long)
prompt_lengths = prompt_batch["attention_mask"].sum(dim=1)
for i, prompt_len in enumerate(prompt_lengths.tolist()):
token_positions = batch["attention_mask"][i].nonzero(as_tuple=True)[0]
completion_mask[i, token_positions[prompt_len:]] = 1
if max_length is not None and batch["input_ids"].shape[1] > max_length:
batch["input_ids"] = batch["input_ids"][:, :max_length]
batch["attention_mask"] = batch["attention_mask"][:, :max_length]
completion_mask = completion_mask[:, :max_length]
for key in ("token_type_ids", "mm_token_type_ids"):
if key in batch:
batch[key] = batch[key][:, :max_length]
return batch, completion_mask
def compute_masked_sequence_logprobs(model, batch, completion_mask):
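    """Length-normalized log-probability and entropy of the masked completion.

    Shifts logits/labels by one position (next-token prediction), gathers the
    per-token log-probs, and averages them over completion tokens only, so
    short and long answers are comparable in the PPO objective.
    """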
model_inputs = move_model_batch_to_device(batch, next(model.parameters()).device)
completion_mask = completion_mask.to(model_inputs["input_ids"].device)
outputs = model(**model_inputs)
logits = outputs.logits[:, :-1, :]
labels = model_inputs["input_ids"][:, 1:]
token_mask = completion_mask[:, 1:].float()
log_probs = F.log_softmax(logits, dim=-1)
token_log_probs = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
masked_log_probs = token_log_probs * token_mask
denom = token_mask.sum(dim=1).clamp_min(1.0)
seq_log_probs = masked_log_probs.sum(dim=1) / denom
probs = log_probs.exp()
token_entropy = -(probs * log_probs).sum(dim=-1)
seq_entropy = (token_entropy * token_mask).sum(dim=1) / denom
return seq_log_probs, seq_entropy
def compute_single_open_reward(pred: str, ref: str) -> tuple[float, dict]:
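    """Reward for an open-ended answer, scaled to roughly [-1, 1].

    Blends BERTScore (0.55), token F1 (0.30), ROUGE-L (0.10), and exact
    match (0.05), then maps the [0, 1] blend to [-1, 1] via 2*x - 1.
    """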
from src.utils.metrics import compute_exact_match, compute_f1, compute_rouge_l
from src.utils import metrics as metrics_module
norm_pred = normalize_answer(pred) or "."
norm_ref = normalize_answer(ref) or "."
exact = compute_exact_match(norm_pred, norm_ref)
f1 = compute_f1(norm_pred, norm_ref)
rouge_l = compute_rouge_l(norm_pred, norm_ref)
bert = 0.0
scorer = getattr(metrics_module, "bert_scorer", None)
if scorer is not None:
try:
_, _, bert_f1 = scorer.score([norm_pred], [norm_ref])
bert = float(bert_f1.mean().item())
except Exception:
bert = 0.0
blended = (0.55 * bert) + (0.30 * f1) + (0.10 * rouge_l) + (0.05 * exact)
reward = (2.0 * blended) - 1.0
return reward, {
"bert": bert,
"f1": f1,
"rouge_l": rouge_l,
"exact": exact,
"blended": blended,
}
def train(args):
    # 1. Load configuration
with open(args.config, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
# ── WandB Setup ──────────────────────────────────────────────────────────
_wandb_cfg = config.get("wandb", {})
_use_wandb = bool(os.environ.get("WANDB_API_KEY") or os.environ.get("WANDB_MODE"))
if _use_wandb:
_api_key = os.environ.get("WANDB_API_KEY")
if _api_key:
wandb.login(key=_api_key)
        # Offline mode: set WANDB_MODE=offline or config wandb.offline: true
_offline = _wandb_cfg.get("offline", False) or \
os.environ.get("WANDB_MODE", "").lower() == "offline"
if _offline:
os.environ["WANDB_MODE"] = "offline"
print("[INFO] WandB chạy ở chế độ OFFLINE (sync sau bằng: wandb sync)")
        # Variant-specific tags from the YAML config
_tags = _wandb_cfg.get("tags", {}).get(args.variant, [])
        # Rich config capturing the full experiment setup
_run_config = {
# ── Model architecture ──
"variant": args.variant,
"decoder_type": config["model_a"].get("decoder_type"),
"image_encoder": config["model_a"].get("image_encoder"),
"text_encoder": config["model_a"].get("text_encoder"),
"hidden_size": config["model_a"].get("hidden_size"),
"transformer_heads": config["model_a"].get("transformer_heads"),
"transformer_ff_dim": config["model_a"].get("transformer_ff_dim"),
"transformer_layers": config["model_a"].get("transformer_decoder_layers"),
"norm_first": config["model_a"].get("transformer_norm_first"),
"freeze_phobert_layers": config["model_a"].get("freeze_phobert_layers"),
# ── Training ──
"learning_rate": config["train"].get("learning_rate"),
"phobert_lr": config["train"].get("phobert_lr"),
"vision_lr": config["train"].get("vision_lr"),
"batch_size": config["train"].get("batch_size"),
"grad_accum_steps": config["train"].get("gradient_accumulation_steps"),
"effective_batch": config["train"].get("batch_size", 32) *
config["train"].get("gradient_accumulation_steps", 1),
"label_smoothing": config["train"].get("label_smoothing"),
"open_loss_weight": config["train"].get("open_loss_weight"),
"warmup_epochs": config["train"].get("warmup_epochs"),
"scheduler": config["train"].get("scheduler"),
"patience": config["train"].get("patience"),
"use_amp": config["train"].get("use_amp"),
# ── Data ──
"dataset": config["data"].get("dataset_name"),
"max_question_len": config["data"].get("max_question_len"),
"max_answer_len": config["data"].get("max_answer_len"),
# ── Eval ──
"beam_width": config["eval"].get("beam_width_a") if args.variant in ("A1", "A2")
else config["eval"].get("beam_width_b"),
}
        # Attach hardware info
if torch.cuda.is_available():
_run_config["gpu_name"] = torch.cuda.get_device_name(0)
_run_config["gpu_count"] = torch.cuda.device_count()
_run_config["vram_gb"] = round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1)
        _entity = _wandb_cfg.get("entity") or None  # None = WandB uses the default entity
wandb.init(
project=_wandb_cfg.get("project", "MedicalVQA-Vietnam"),
entity=_entity,
name=f"{args.variant}-{datetime.now().strftime('%m%d-%H%M')}",
group=_wandb_cfg.get("group", "DL-Final"),
job_type=_wandb_cfg.get("job_type", "train"),
tags=_tags,
notes=_wandb_cfg.get("notes", ""),
config=_run_config,
save_code=_wandb_cfg.get("save_code", True),
reinit="finish_previous", # Kết thúc run trước nếu chạy nhiều variant liên tiếp
)
print(f"[INFO] ✅ WandB run: {wandb.run.url}")
        # Watch model gradients if enabled
if _wandb_cfg.get("watch_model", False):
            # the model is not instantiated yet; the hook runs after the model is created
os.environ["_WANDB_WATCH_PENDING"] = "1"
else:
print("[INFO] WandB không được cấu hình (thiếu WANDB_API_KEY) — bỏ qua logging.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Thiết bị sử dụng: {device}")
history_dir = create_history_dir(config.get("log_dir", "logs/medical_vqa"), args.variant)
print(f"[INFO] Lưu training history tại: {history_dir}")
# 2. Tokenizer & Dataset
tokenizer = AutoTokenizer.from_pretrained(config['model_a']['phobert_model'])
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token
transform = MedicalTransform(size=config['data']['image_size'])
answer_max_words = int(config['data'].get('answer_max_words', 10))
    # Load data from the HuggingFace Hub or from local files
hf_repo = config['data'].get('hf_dataset')
use_hf_splits = bool(config['data'].get('use_hf_splits', True))
if hf_repo and use_hf_splits:
print(f"[INFO] Đang tải dữ liệu từ Hub: {hf_repo}")
dataset_dict = load_dataset(hf_repo)
if args.debug:
print("[WARNING] DEBUG MODE: Chỉ lấy 20 mẫu để chạy thử.")
dataset_dict['train'] = dataset_dict['train'].select(range(min(20, len(dataset_dict['train']))))
config['train']['epochs'] = 2
config['train']['batch_size'] = 2
train_ds = MedicalVQADataset(
hf_dataset=dataset_dict['train'],
tokenizer=tokenizer,
transform=transform,
max_seq_len=config['data']['max_question_len'],
max_ans_len=config['data']['max_answer_len'],
answer_max_words=answer_max_words
)
val_ds = MedicalVQADataset(
hf_dataset=dataset_dict['validation'],
tokenizer=tokenizer,
transform=transform,
max_seq_len=config['data']['max_question_len'],
max_ans_len=config['data']['max_answer_len'],
answer_max_words=answer_max_words
)
    else:
        # Local-data path: clear hf_repo so the later `if hf_repo:` branches
        # never touch the undefined dataset_dict
        hf_repo = None
        vqa_path = config['data']['vqa_json']
        print(f"[INFO] Loading local data from: {vqa_path}")
full_dataset = MedicalVQADataset(
json_path=vqa_path,
image_dir=config['data']['image_dir'],
tokenizer=tokenizer,
transform=transform,
max_seq_len=config['data']['max_question_len'],
max_ans_len=config['data']['max_answer_len'],
answer_max_words=answer_max_words
)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_ds, val_ds = random_split(full_dataset, [train_size, val_size])
train_loader = DataLoader(
train_ds,
batch_size=config['train']['batch_size'],
shuffle=True,
collate_fn=vqa_collate_fn,
num_workers=config['train'].get('num_workers', 0),
pin_memory=config['train'].get('pin_memory', False)
)
val_loader = DataLoader(
val_ds,
        batch_size=config['train'].get('eval_batch_size', 8),
collate_fn=vqa_collate_fn
)
    # 3. Model initialization per variant
if args.variant in ['A1', 'A2']:
decoder_type = "lstm" if args.variant == 'A1' else "transformer"
model = MedicalVQAModelA(
decoder_type=decoder_type,
vocab_size=len(tokenizer),
hidden_size=config['model_a'].get('hidden_size', 768),
phobert_model=config['model_a'].get('phobert_model', "vinai/phobert-base")
).to(device)
        # Log model param count to WandB
if wandb.run:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
wandb.config.update({
"total_params_M": round(total_params / 1e6, 2),
"trainable_params_M": round(trainable_params / 1e6, 2),
})
print(f"[INFO] Tổng params: {total_params/1e6:.1f}M | Trainable: {trainable_params/1e6:.1f}M")
            # wandb.watch: only enabled when log_gradients: true
if _wandb_cfg.get("log_gradients", False):
wandb.watch(model, log="gradients",
log_freq=_wandb_cfg.get("log_freq", 50))
        # Optimizer with differential learning rates per module group
optimizer = optim.AdamW([
{'params': model.image_encoder.parameters(), 'lr': float(config['train']['vision_lr'])},
{'params': model.text_encoder.parameters(), 'lr': float(config['train']['phobert_lr'])},
{'params': model.fusion.parameters(), 'lr': float(config['train']['learning_rate'])},
{'params': model.decoder.parameters(), 'lr': float(config['train']['learning_rate'])}
])
        # [CRITICAL FIX] Cosine schedule with warmup, stepped per batch instead of per epoch
from transformers import get_cosine_schedule_with_warmup
# Use a_epochs for Direction A models (A1, A2), otherwise use default epochs
if args.variant in ['A1', 'A2']:
epochs = config['train'].get('a_epochs', config['train']['epochs'])
else:
epochs = config['train']['epochs']
warmup_epochs = config['train'].get('warmup_epochs', 5)
accumulation_steps = config['train'].get('gradient_accumulation_steps', 2)
total_steps = epochs * len(train_loader) // max(accumulation_steps, 1)
warmup_steps = warmup_epochs * len(train_loader) // max(accumulation_steps, 1)
scheduler = get_cosine_schedule_with_warmup(
optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps
)
        # Initialize the trainer with pad_token_id and beam_width from config
beam_width = config['eval'].get('beam_width_a', 5)
from src.engine.trainer import MedicalVQATrainer
trainer = MedicalVQATrainer(
model=model,
train_loader=train_loader,
val_loader=val_loader,
optimizer=optimizer,
scheduler=scheduler,
device=device,
config={
**config,
'variant': args.variant,
'history_dir': history_dir,
# Pass tunable open-loss weight so trainer doesn't use hardcoded value
'open_loss_weight': config['train'].get('open_loss_weight', 2.0),
},
pad_token_id=tokenizer.pad_token_id,
beam_width=beam_width
)
print(f"[INFO] Beam Width cho Hướng A: {beam_width}")
print(f"[INFO] Bắt đầu huấn luyện cấu hình {args.variant} ({epochs} epochs)...")
trainer.train(epochs, tokenizer=tokenizer)
if wandb.run:
wandb.finish()
return
elif args.variant == 'PPO':
from src.engine.medical_eval import evaluate_multimodal_vqa
ppo_cfg = config.get('ppo', {})
ppo_answer_max_words = int(ppo_cfg.get('max_answer_words', min(answer_max_words, 6)))
wrapper = MultimodalVQA(
model_id=config['model_b']['model_name'],
lora_r=int(config['model_b'].get('lora_r', 16)),
lora_alpha=int(config['model_b'].get('lora_alpha', 32)),
lora_dropout=float(config['model_b'].get('lora_dropout', 0.05)),
lora_target_modules=config['model_b'].get('lora_target_modules'),
)
b2_checkpoint = select_best_adapter_checkpoint(config['train'].get('b2_output_dir', './checkpoints/B2'))
print(f"[INFO] PPO sẽ khởi tạo từ B2 checkpoint: {b2_checkpoint}")
model, processor = wrapper.load_model(adapter_path=str(b2_checkpoint), is_trainable=True)
if not ppo_cfg.get('train_mlp_lora', False):
frozen_lora = 0
for name, param in model.named_parameters():
if "lora_" in name and any(proj in name for proj in ("gate_proj", "up_proj", "down_proj")):
param.requires_grad = False
frozen_lora += param.numel()
print(f"[INFO] PPO đang freeze LoRA MLP để giảm VRAM: {frozen_lora:,} tham số")
model.print_trainable_parameters()
def _build_ppo_source():
if hf_repo:
return dataset_dict['train'], dataset_dict['train']
if hasattr(train_ds, "dataset") and hasattr(train_ds.dataset, "data"):
subset_indices = getattr(train_ds, "indices", list(range(len(train_ds.dataset.data))))
local_items = [train_ds.dataset.data[i] for i in subset_indices]
return local_items, None
raise ValueError("Khong the truy cap raw train data de tao PPO rollout set.")
def _prepare_ppo_records(raw_items, num_samples: int, closed_ratio: float):
closed_records = []
open_records = []
for idx in range(len(raw_items)):
item = raw_items[idx]
question = str(item.get("question_vi", item.get("question", ""))).strip()
target = get_target_answer(item, max_words=ppo_answer_max_words)
if not question or not target:
continue
record = {
"question": question,
"target": target,
"source_idx": idx,
"image": item.get("image_name"),
"is_closed": infer_closed_answer_type(item, target),
}
if record["is_closed"]:
closed_records.append(record)
else:
open_records.append(record)
rng = random.Random(int(config.get("seed", 42)))
rng.shuffle(closed_records)
rng.shuffle(open_records)
target_closed = min(len(closed_records), int(round(num_samples * closed_ratio)))
target_open = min(len(open_records), max(0, num_samples - target_closed))
selected = closed_records[:target_closed] + open_records[:target_open]
rng.shuffle(selected)
return selected
raw_train_source, hf_train_source = _build_ppo_source()
ppo_records = _prepare_ppo_records(
raw_train_source,
num_samples=int(ppo_cfg.get('num_samples', 192)),
closed_ratio=float(ppo_cfg.get('closed_ratio', 0.5)),
)
if not ppo_records:
raise ValueError("Khong tao duoc PPO rollout set hop le.")
print(f"[INFO] PPO rollout set: {len(ppo_records)} mau")
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.AdamW(
trainable_params,
lr=float(ppo_cfg.get('learning_rate', 5.0e-7)),
weight_decay=float(ppo_cfg.get('weight_decay', 0.0)),
)
rollout_batch_size = max(1, int(ppo_cfg.get('rollout_batch_size', 2)))
total_updates = max(1, (len(ppo_records) + rollout_batch_size - 1) // rollout_batch_size)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_updates)
ppo_history = []
eos = processor.tokenizer.eos_token or ""
max_seq_length = max(int(config['train'].get('dpo_max_length', 768)), 768)
grad_clip = float(config['train'].get('grad_clip', 1.0))
entropy_coef = float(ppo_cfg.get('entropy_coef', 0.001))
clip_range = float(ppo_cfg.get('clip_range', 0.2))
max_new_tokens = int(ppo_cfg.get('max_new_tokens', 12))
temperature = float(ppo_cfg.get('temperature', 0.8))
top_p = float(ppo_cfg.get('top_p', 0.9))
closed_positive = float(ppo_cfg.get('closed_positive_reward', 1.0))
closed_negative = float(ppo_cfg.get('closed_negative_reward', -1.0))
print("[INFO] Bắt đầu huấn luyện PPO-style refinement...")
model.train()
for update_idx in range(total_updates):
batch_records = ppo_records[update_idx * rollout_batch_size:(update_idx + 1) * rollout_batch_size]
prompts, images, questions, targets, closed_flags = [], [], [], [], []
for record in batch_records:
image = resolve_dpo_image(
record,
hf_train_data=hf_train_source,
image_dir=config['data'].get('image_dir'),
)
if image is None:
continue
prompts.append(build_dpo_instruction_prompt(record["question"], max_words=ppo_answer_max_words))
images.append(image)
questions.append(record["question"])
targets.append(record["target"])
closed_flags.append(record["is_closed"])
if not prompts:
continue
generation_inputs = processor(
text=prompts,
images=images,
return_tensors="pt",
padding=True,
)
generation_inputs = move_model_batch_to_device(generation_inputs, next(model.parameters()).device)
if "pixel_values" in generation_inputs:
generation_inputs["pixel_values"] = generation_inputs["pixel_values"].to(torch.bfloat16)
with torch.no_grad():
generated_ids = model.generate(
**generation_inputs,
max_new_tokens=max_new_tokens,
do_sample=True,
temperature=temperature,
top_p=top_p,
num_beams=1,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
)
prompt_token_len = generation_inputs["input_ids"].shape[1]
generated_texts = processor.batch_decode(
generated_ids[:, prompt_token_len:],
skip_special_tokens=True,
)
sampled_answers = []
rewards = []
reward_breakdown = []
for question, target, is_closed, raw_output in zip(questions, targets, closed_flags, generated_texts):
pred = sanitize_dpo_completion(question, raw_output, max_words=ppo_answer_max_words)
if not pred:
pred = "không" if is_closed else "không rõ"
sampled_answers.append(pred)
if is_closed:
reward = closed_positive if normalize_answer(pred) == normalize_answer(target) else closed_negative
rewards.append(reward)
reward_breakdown.append({"exact": float(reward > 0), "reward": reward})
else:
reward, details = compute_single_open_reward(pred, target)
rewards.append(reward)
reward_breakdown.append(details | {"reward": reward})
completion_texts = [f" {pred}{eos}" for pred in sampled_answers]
rollout_batch, rollout_mask = build_multimodal_completion_batch(
processor,
prompts,
completion_texts,
images,
max_length=max_seq_length,
)
with torch.no_grad():
old_seq_log_probs, _ = compute_masked_sequence_logprobs(model, rollout_batch, rollout_mask)
reward_tensor = torch.tensor(rewards, dtype=torch.float32, device=old_seq_log_probs.device)
if reward_tensor.numel() > 1:
advantages = reward_tensor - reward_tensor.mean()
advantages = advantages / advantages.std(unbiased=False).clamp_min(1e-6)
else:
advantages = reward_tensor
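            # PPO clipped surrogate below: L = -E[min(r*A, clip(r, 1-eps, 1+eps)*A)]
            # with r = exp(logp_new - logp_old). With a single gradient step per
            # rollout (no inner PPO epochs), the policy has not moved between the
            # two forward passes, so r stays near 1 (only dropout noise separates
            # them) and the update is effectively advantage-weighted REINFORCE;
            # the clip acts as a safeguard against large per-step policy moves.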
optimizer.zero_grad(set_to_none=True)
new_seq_log_probs, entropy = compute_masked_sequence_logprobs(model, rollout_batch, rollout_mask)
ratios = torch.exp(new_seq_log_probs - old_seq_log_probs.detach())
clipped_ratios = torch.clamp(ratios, 1.0 - clip_range, 1.0 + clip_range)
surrogate_1 = ratios * advantages
surrogate_2 = clipped_ratios * advantages
policy_loss = -torch.min(surrogate_1, surrogate_2).mean()
entropy_bonus = entropy.mean()
loss = policy_loss - (entropy_coef * entropy_bonus)
loss.backward()
torch.nn.utils.clip_grad_norm_(trainable_params, grad_clip)
optimizer.step()
scheduler.step()
closed_rewards = [r for r, is_closed in zip(rewards, closed_flags) if is_closed]
open_rewards = [r for r, is_closed in zip(rewards, closed_flags) if not is_closed]
log_record = {
"epoch": 1,
"update": update_idx + 1,
"train_loss": float(loss.detach().cpu().item()),
"policy_loss": float(policy_loss.detach().cpu().item()),
"entropy": float(entropy_bonus.detach().cpu().item()),
"avg_reward": float(sum(rewards) / len(rewards)),
"avg_closed_reward": float(sum(closed_rewards) / len(closed_rewards)) if closed_rewards else None,
"avg_open_reward": float(sum(open_rewards) / len(open_rewards)) if open_rewards else None,
"learning_rate": float(scheduler.get_last_lr()[0]),
"sample_predictions": sampled_answers[:2],
"sample_targets": targets[:2],
"reward_breakdown": reward_breakdown[:2],
}
ppo_history.append(log_record)
if wandb.run:
wandb.log({
"ppo/train_loss": log_record["train_loss"],
"ppo/policy_loss": log_record["policy_loss"],
"ppo/entropy": log_record["entropy"],
"ppo/avg_reward": log_record["avg_reward"],
"ppo/avg_closed_reward": log_record["avg_closed_reward"],
"ppo/avg_open_reward": log_record["avg_open_reward"],
"ppo/learning_rate": log_record["learning_rate"],
"ppo/update": log_record["update"],
})
del generation_inputs, generated_ids
if torch.cuda.is_available():
torch.cuda.empty_cache()
final_ppo_dir = Path("checkpoints/PPO/final_adapter")
final_ppo_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(final_ppo_dir))
processor.save_pretrained(str(final_ppo_dir))
with open("checkpoints/medical_vqa_ppo_from.txt", "w", encoding="utf-8") as f:
f.write(str(b2_checkpoint))
print("[INFO] Đang chạy đánh giá nghiệm thu trên tập Validation cho PPO...")
model.eval()
metrics = evaluate_multimodal_vqa(
model,
val_loader,
device,
processor,
beam_width=config['eval'].get('beam_width_b', 1),
beam_width_closed=config['eval'].get('beam_width_b_closed', 1),
beam_width_open=config['eval'].get('beam_width_b_open', config['eval'].get('beam_width_b', 1)),
max_new_tokens_closed=config['eval'].get('max_new_tokens_b_closed', 4),
max_new_tokens_open=config['eval'].get('max_new_tokens_b_open', answer_max_words + 6),
generation_batch_size=config['eval'].get('generation_batch_size_b', 1),
max_words=answer_max_words,
variant='PPO'
)
closed_eval = metrics.get('closed_eval', {})
open_eval = metrics.get('open_eval', {})
ppo_history.append({
"epoch": 1,
"val_accuracy_normalized": metrics.get('accuracy_normalized'),
"val_f1_normalized": metrics.get('f1_normalized'),
"val_bleu4_normalized": metrics.get('bleu4_normalized'),
"val_bert_score_raw": metrics.get('bert_score_raw'),
"val_semantic_raw": metrics.get('semantic_raw'),
"val_closed_accuracy": closed_eval.get('accuracy', 0),
"val_closed_em": closed_eval.get('em', 0),
"val_closed_f1": closed_eval.get('f1', 0),
"val_open_semantic": open_eval.get('semantic', 0),
"val_open_bertscore": open_eval.get('bert_score', 0),
"val_open_f1": open_eval.get('f1', 0),
"val_open_rouge_l": open_eval.get('rouge_l', 0),
})
        b2_metrics = load_latest_variant_metrics(os.path.join(config.get('log_dir', 'logs/medical_vqa'), "history"), "B2")
ppo_acceptance = evaluate_refinement_acceptance(b2_metrics, ppo_history[-1])
ppo_history[-1]["ppo_acceptance"] = ppo_acceptance
print(f"[INFO] {ppo_acceptance['summary']}")
if ppo_acceptance["status"] == "accepted":
print("[SUCCESS] PPO accepted: dat tieu chi refinement nhe tren B2.")
elif ppo_acceptance["status"] == "failed":
print("[WARN] PPO failed, keep B2. Khong khuyen nghi tiep tuc tuning them.")
os.makedirs("checkpoints/PPO", exist_ok=True)
with open("checkpoints/PPO/acceptance_summary.json", "w", encoding="utf-8") as f:
json.dump(ppo_acceptance, f, ensure_ascii=False, indent=2)
save_history_records(history_dir, ppo_history)
print("[SUCCESS] Đã lưu checkpoint và metrics PPO.")
return
elif args.variant == 'DPO':
from trl import DPOTrainer
try:
from trl import DPOConfig
except ImportError:
DPOConfig = None
from transformers import TrainingArguments
from datasets import Dataset as HFDataset
import inspect
dpo_answer_max_words = int(config.get('dpo', {}).get('max_answer_words', min(answer_max_words, 6)))
wrapper = MultimodalVQA(
model_id=config['model_b']['model_name'],
lora_r=int(config['model_b'].get('lora_r', 16)),
lora_alpha=int(config['model_b'].get('lora_alpha', 32)),
lora_dropout=float(config['model_b'].get('lora_dropout', 0.05)),
lora_target_modules=config['model_b'].get('lora_target_modules'),
)
explicit_b2_checkpoint = (
config.get('train', {}).get('b2_checkpoint')
or os.environ.get('B2_CHECKPOINT_PATH')
)
if explicit_b2_checkpoint:
b2_checkpoint = Path(explicit_b2_checkpoint).expanduser().resolve()
if not b2_checkpoint.exists():
raise FileNotFoundError(f"Không tìm thấy B2 checkpoint được chỉ định: {b2_checkpoint}")
print(f"[INFO] DPO sẽ khởi tạo từ B2 checkpoint chỉ định: {b2_checkpoint}")
else:
b2_checkpoint = select_best_adapter_checkpoint(config['train'].get('b2_output_dir', './checkpoints/B2'))
print(f"[INFO] DPO sẽ khởi tạo từ B2 checkpoint: {b2_checkpoint}")
try:
model, processor = wrapper.load_model(adapter_path=str(b2_checkpoint), is_trainable=True)
except Exception as exc:
print(f"[WARNING] Không load được B2 checkpoint, fallback sang base LLaVA-Med + LoRA mới: {exc}")
model, processor = wrapper.load_model(adapter_path=None, is_trainable=True)
if not config['train'].get('dpo_train_mlp_lora', False):
frozen_lora = 0
for name, param in model.named_parameters():
if "lora_" in name and any(proj in name for proj in ("gate_proj", "up_proj", "down_proj")):
param.requires_grad = False
frozen_lora += param.numel()
print(f"[INFO] DPO đang freeze LoRA MLP để giảm VRAM: {frozen_lora:,} tham số")
model.print_trainable_parameters()
        # Create or load preference data
pref_json = config.get('dpo', {}).get('preference_data', 'data/preference_data_slake.json')
force_rebuild_pref = bool(config.get('dpo', {}).get('force_rebuild_preference_data', False))
if force_rebuild_pref and os.path.exists(pref_json):
print(f"[INFO] Dang xoa preference data cu de tao lai theo cau hinh hien tai: {pref_json}")
os.remove(pref_json)
if not os.path.exists(pref_json):
print(f"[INFO] Chưa có preference data. Đang tự động tạo từ training data...")
from src.engine.dpo_trainer import create_preference_data
if hf_repo:
raw_data = [{"question_vi": item["question_vi"], "answer_vi": get_target_answer(item, max_words=dpo_answer_max_words),
"image_name": item.get("image_name"),
"source_idx": i}
for i, item in enumerate(dataset_dict['train'])]
tmp_json = "data/tmp_train_for_dpo.json"
os.makedirs("data", exist_ok=True)
with open(tmp_json, 'w', encoding='utf-8') as f:
json.dump(raw_data, f, ensure_ascii=False, indent=2)
create_preference_data(
tmp_json,
pref_json,
num_pairs=int(config.get('dpo', {}).get('num_pairs', 400)),
closed_ratio=float(config.get('dpo', {}).get('closed_ratio', 0.6)),
max_answer_words=dpo_answer_max_words,
)
else:
create_preference_data(
config['data']['vqa_json'],
pref_json,
num_pairs=int(config.get('dpo', {}).get('num_pairs', 400)),
closed_ratio=float(config.get('dpo', {}).get('closed_ratio', 0.6)),
max_answer_words=dpo_answer_max_words,
)
        # Read the preference-data JSON file
with open(pref_json, 'r', encoding='utf-8') as f:
pref_data = json.load(f)
if hf_repo and any("source_idx" not in item for item in pref_data):
print("[INFO] Preference data cu khong co source_idx. Dang tao lai de giu lien ket image cho DPO...")
from src.engine.dpo_trainer import create_preference_data
raw_data = [{"question_vi": item["question_vi"], "answer_vi": get_target_answer(item, max_words=dpo_answer_max_words),
"image_name": item.get("image_name"), "source_idx": i}
for i, item in enumerate(dataset_dict['train'])]
tmp_json = "data/tmp_train_for_dpo.json"
with open(tmp_json, 'w', encoding='utf-8') as f:
json.dump(raw_data, f, ensure_ascii=False, indent=2)
create_preference_data(
tmp_json,
pref_json,
num_pairs=int(config.get('dpo', {}).get('num_pairs', 400)),
closed_ratio=float(config.get('dpo', {}).get('closed_ratio', 0.6)),
max_answer_words=dpo_answer_max_words,
)
with open(pref_json, 'r', encoding='utf-8') as f:
pref_data = json.load(f)
        # Build the HF Dataset for DPOTrainer (requires columns: prompt, chosen, rejected)
prompts, chosens, rejecteds, images = [], [], [], []
eos = processor.tokenizer.eos_token or ""
filtered_pairs = 0
for item in pref_data:
q = item.get("question", "")
chosen = sanitize_dpo_completion(q, item.get("chosen", ""), max_words=dpo_answer_max_words)
rejected = sanitize_dpo_completion(q, item.get("rejected", ""), max_words=dpo_answer_max_words)
image = resolve_dpo_image(
item,
hf_train_data=dataset_dict['train'] if hf_repo else None,
image_dir=config['data'].get('image_dir'),
)
if not chosen or not rejected or chosen == rejected or image is None:
filtered_pairs += 1
continue
prompts.append(build_dpo_instruction_prompt(q, max_words=dpo_answer_max_words))
chosens.append(f" {chosen}{eos}")
rejecteds.append(f" {rejected}{eos}")
images.append(image)
if not prompts:
raise ValueError("Khong con cap preference hop le sau khi sanitize DPO data.")
if filtered_pairs:
print(f"[INFO] Da bo qua {filtered_pairs} cap preference khong hop le sau sanitize.")
dpo_hf_dataset = HFDataset.from_dict({
"prompt": prompts,
"chosen": chosens,
"rejected": rejecteds,
"image": images,
})
class MultimodalDPODataCollator:
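            """Collate DPO examples into a single multimodal batch.

            Concatenates [chosen | rejected] sequences (duplicating prompts and
            images accordingly) so one forward pass scores both completions;
            "completion_mask" marks the answer tokens, mirroring
            build_multimodal_completion_batch above.
            """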
def __init__(self, processor, max_length=None):
self.processor = processor
self.tokenizer = processor.tokenizer
# LLaVA expands a single <image> placeholder into hundreds of visual tokens.
# If max_length is too small, the processor truncates those tokens and raises
# "image token count" mismatch. Keep a safe floor for multimodal DPO.
self.max_length = max(max_length or 0, 768) if max_length is not None else None
def __call__(self, examples):
prompts = [example["prompt"] for example in examples]
chosens = [example["chosen"] for example in examples]
rejecteds = [example["rejected"] for example in examples]
images = [example["image"] for example in examples]
full_texts = [f"{prompt}{chosen}" for prompt, chosen in zip(prompts, chosens)]
full_texts.extend(f"{prompt}{rejected}" for prompt, rejected in zip(prompts, rejecteds))
repeated_prompts = prompts + prompts
repeated_images = images + images
batch = self.processor(
text=full_texts,
images=repeated_images,
return_tensors="pt",
padding=True,
truncation=False,
)
prompt_batch = self.processor(
text=repeated_prompts,
images=repeated_images,
return_tensors="pt",
padding=True,
truncation=False,
)
completion_mask = torch.zeros_like(batch["input_ids"], dtype=torch.long)
prompt_lengths = prompt_batch["attention_mask"].sum(dim=1)
for i, prompt_len in enumerate(prompt_lengths.tolist()):
token_positions = batch["attention_mask"][i].nonzero(as_tuple=True)[0]
completion_mask[i, token_positions[prompt_len:]] = 1
if self.max_length is not None and batch["input_ids"].shape[1] > self.max_length:
batch["input_ids"] = batch["input_ids"][:, :self.max_length]
batch["attention_mask"] = batch["attention_mask"][:, :self.max_length]
completion_mask = completion_mask[:, :self.max_length]
for key in ("token_type_ids", "mm_token_type_ids"):
if key in batch:
batch[key] = batch[key][:, :self.max_length]
batch["completion_mask"] = completion_mask
return batch
dpo_sequence_limits = {
"max_length": max(int(config['train'].get('dpo_max_length', 768)), 768),
"max_prompt_length": int(config['train'].get('dpo_max_prompt_length', 96)),
"max_completion_length": int(config['train'].get('dpo_max_completion_length', 24)),
}
training_args_dict = {
"output_dir": "./checkpoints/DPO",
"per_device_train_batch_size": int(config['train'].get('dpo_batch_size', 1)),
"gradient_accumulation_steps": int(config['train'].get('dpo_gradient_accumulation_steps', 8)),
"num_train_epochs": config['train'].get('dpo_epochs', 1),
"learning_rate": float(config.get('dpo', {}).get('learning_rate', 1.0e-6)),
"lr_scheduler_type": "cosine", # [OPTIMIZED] Giúp hội tụ mượt mà hơn
"warmup_ratio": 0.1, # [OPTIMIZED] Tránh sốc gradient ở epoch đầu
"bf16": True,
"remove_unused_columns": False,
"logging_steps": 10,
"save_strategy": "epoch",
"save_total_limit": 1,
"optim": config['train'].get('dpo_optim', 'paged_adamw_8bit'),
"gradient_checkpointing": True,
}
if DPOConfig is not None:
training_args_dict["beta"] = float(config.get('dpo', {}).get('beta', 0.1))
dpo_config_params = set(inspect.signature(DPOConfig.__init__).parameters)
for key, value in dpo_sequence_limits.items():
if key in dpo_config_params:
training_args_dict[key] = value
training_args = DPOConfig(**training_args_dict)
else:
training_args = build_training_arguments(TrainingArguments, **training_args_dict)
training_args.model_init_kwargs = None
dpo_kwargs = {
"model": model,
"args": training_args,
"train_dataset": dpo_hf_dataset,
"data_collator": MultimodalDPODataCollator(processor, max_length=dpo_sequence_limits["max_length"]),
}
dpo_trainer_params = set(inspect.signature(DPOTrainer.__init__).parameters)
for key, value in dpo_sequence_limits.items():
if key in dpo_trainer_params:
dpo_kwargs[key] = value
try:
print("[INFO] Thử khởi tạo DPOTrainer với processing_class...")
trainer = DPOTrainer(**dpo_kwargs, processing_class=processor)
except TypeError:
try:
trainer = DPOTrainer(**dpo_kwargs, tokenizer=processor)
except TypeError:
trainer = DPOTrainer(**dpo_kwargs, tokenizer=processor.tokenizer)
print("[INFO] Bắt đầu huấn luyện DPO...")
trainer.train()
os.makedirs("checkpoints", exist_ok=True)
final_dpo_dir = Path("checkpoints/DPO/final_adapter")
final_dpo_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(str(final_dpo_dir))
processor.save_pretrained(str(final_dpo_dir))
with open("checkpoints/medical_vqa_dpo_from.txt", "w", encoding="utf-8") as f:
f.write(str(b2_checkpoint))
        # [FIX] Evaluate DPO after training so Accuracy, F1, and BLEU are available for the comparison charts
from src.engine.medical_eval import evaluate_multimodal_vqa
print("[INFO] Đang chạy đánh giá nghiệm thu trên tập Validation cho DPO...")
model.eval()
metrics = evaluate_multimodal_vqa(
model,
val_loader,
device,
processor,
beam_width=config['eval'].get('beam_width_b', 1),
beam_width_closed=config['eval'].get('beam_width_b_closed', 1),
beam_width_open=config['eval'].get('beam_width_b_open', config['eval'].get('beam_width_b', 1)),
max_new_tokens_closed=config['eval'].get('max_new_tokens_b_closed', 4),
max_new_tokens_open=config['eval'].get('max_new_tokens_b_open', answer_max_words + 6),
generation_batch_size=config['eval'].get('generation_batch_size_b', 1),
max_words=answer_max_words,
variant='DPO'
)
closed_eval = metrics.get('closed_eval', {})
open_eval = metrics.get('open_eval', {})
print(f"\n[RESULT DPO - CLOSED QUESTIONS]")
print(f"Count: {closed_eval.get('count', 0)}")
print(f"Accuracy: {closed_eval.get('accuracy', 0):.4f}")
print(f"EM: {closed_eval.get('em', 0):.4f}")
print(f"F1: {closed_eval.get('f1', 0):.4f}")
print(f"\n[RESULT DPO - OPEN QUESTIONS]")
print(f"Count: {open_eval.get('count', 0)}")
print(f"Semantic: {open_eval.get('semantic', 0):.4f}")
print(f"BERTScore: {open_eval.get('bert_score', 0):.4f}")
print(f"F1: {open_eval.get('f1', 0):.4f}")
print(f"ROUGE-L: {open_eval.get('rouge_l', 0):.4f}")
final_epoch = training_args.num_train_epochs
trainer.state.log_history.append({
"epoch": final_epoch,
"val_accuracy_normalized": metrics.get('accuracy_normalized'),
"val_f1_normalized": metrics.get('f1_normalized'),
"val_bleu4_normalized": metrics.get('bleu4_normalized'),
"val_bert_score_raw": metrics.get('bert_score_raw'),
"val_semantic_raw": metrics.get('semantic_raw'),
"val_closed_accuracy": closed_eval.get('accuracy', 0),
"val_closed_em": closed_eval.get('em', 0),
"val_closed_f1": closed_eval.get('f1', 0),
"val_open_semantic": open_eval.get('semantic', 0),
"val_open_bertscore": open_eval.get('bert_score', 0),
"val_open_f1": open_eval.get('f1', 0),
"val_open_rouge_l": open_eval.get('rouge_l', 0),
})
        b2_metrics = load_latest_variant_metrics(os.path.join(config.get('log_dir', 'logs/medical_vqa'), "history"), "B2")
dpo_acceptance = evaluate_dpo_acceptance(b2_metrics, trainer.state.log_history[-1])
trainer.state.log_history[-1]["dpo_acceptance"] = dpo_acceptance
print(f"[INFO] {dpo_acceptance['summary']}")
if dpo_acceptance["status"] == "accepted":
print("[SUCCESS] DPO accepted: dat tieu chi refinement nhe tren B2.")
elif dpo_acceptance["status"] == "failed":
print("[WARN] DPO failed, keep B2. Khong khuyen nghi tiep tuc tuning them.")
os.makedirs("checkpoints/DPO", exist_ok=True)
with open("checkpoints/DPO/acceptance_summary.json", "w", encoding="utf-8") as f:
json.dump(dpo_acceptance, f, ensure_ascii=False, indent=2)
save_history_records(history_dir, trainer.state.log_history)
print("[SUCCESS] Đã lưu checkpoint và metrics DPO.")
return
elif args.variant == 'B2':
# Fine-tuning LLaVA-Med
from transformers import TrainingArguments, Trainer
from datasets import Dataset as HFDataset
wrapper = MultimodalVQA(
model_id=config['model_b']['model_name'],
lora_r=int(config['model_b'].get('lora_r', 16)),
lora_alpha=int(config['model_b'].get('lora_alpha', 32)),
lora_dropout=float(config['model_b'].get('lora_dropout', 0.05)),
lora_target_modules=config['model_b'].get('lora_target_modules'),
)
model, processor = wrapper.load_model()
def make_sft_dataset(raw_ds):
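            """Convert raw VQA items into an SFT dataset with prompt/answer/text/image columns.

            Closed questions are canonicalized to "có"/"không" so the SFT target
            matches the evaluation-time normalization.
            """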
prompts = []
answers = []
texts = []
images = []
            eos = processor.tokenizer.eos_token or ""
            for i in range(len(raw_ds)):
                item = raw_ds[i]
                if not isinstance(item, dict):
                    continue
                img = item.get("image", None)
                if img is None:
                    # Skip image-less examples: appending to the text lists but not
                    # to `images` would silently misalign the dataset columns below.
                    continue
                if img.mode != "RGB":
                    img = img.convert("RGB")
                q = item.get("question_vi", item.get("question", item.get("raw_questions", "")))
                a = get_target_answer(item, max_words=answer_max_words)
                answer_type = str(item.get("answer_type", "")).upper()
                label_closed = item.get("label_closed", None)
                if answer_type == "CLOSED" or label_closed in (0, 1) or a in {"có", "không", "yes", "no"}:
                    a_norm = str(a).strip().lower()
                    a = "không" if a_norm in {"không", "khong", "no", "false", "absent"} else "có"
                prompt = wrapper.build_instruction_prompt(q, language="vi", include_answer=False)
                prompts.append(prompt)
                answers.append(a)
                texts.append(f"{prompt} {a}{eos}")
                images.append(img)
            return HFDataset.from_dict({"prompt": prompts, "answer": answers, "text": texts, "image": images})
if hf_repo:
sft_train = make_sft_dataset(dataset_dict['train'])
sft_val = make_sft_dataset(dataset_dict['validation'])
else:
sft_train = make_sft_dataset(train_ds)
sft_val = make_sft_dataset(val_ds)
class MultimodalDataCollator:
def __init__(self, processor, max_length=None):
self.processor = processor
self.tokenizer = processor.tokenizer
self.max_length = max_length
def __call__(self, examples):
texts = [example["text"] for example in examples]
prompts = [example["prompt"] for example in examples]
images = [example["image"] for example in examples]
batch = self.processor(
text=texts,
images=images,
return_tensors="pt",
padding=True,
)
labels = batch["input_ids"].clone()
labels[labels == self.tokenizer.pad_token_id] = -100
# Mask the full prompt so SFT loss is computed only on the answer.
# Searching for "ASSISTANT:" token ids is brittle because tokenization can
# split the separator differently across models.
prompt_batch = self.processor(
text=prompts,
images=images,
return_tensors="pt",
padding=True,
)
prompt_lengths = prompt_batch["attention_mask"].sum(dim=1)
for i, prompt_len in enumerate(prompt_lengths.tolist()):
token_positions = batch["attention_mask"][i].nonzero(as_tuple=True)[0]
labels[i, token_positions[:prompt_len]] = -100
batch["labels"] = labels
                # The processor batch already contains only tensors; the raw
                # text/image lists are never added to it, so nothing else needs removing.
return batch
b2_micro_batch = int(config['train'].get('b2_batch_size', 1))
b2_grad_accum = int(config['train'].get('b2_gradient_accumulation_steps', max(config['train'].get('gradient_accumulation_steps', 2), 1)))
b2_max_length = int(config['train'].get('b2_max_length', config['data'].get('max_question_len', 64) + config['data'].get('max_answer_len', 20) + 32))
training_args = build_training_arguments(
TrainingArguments,
output_dir="./checkpoints/B2",
per_device_train_batch_size=b2_micro_batch,
per_device_eval_batch_size=int(config['train'].get('b2_eval_batch_size', 1)),
gradient_accumulation_steps=b2_grad_accum,
num_train_epochs=config['train'].get('epochs', 3),
learning_rate=float(config['train'].get('b2_lr', 2.0e-5)),
lr_scheduler_type="cosine",
warmup_steps=int(config['train'].get('b2_warmup_steps', 50)),
bf16=True,
fp16=False,
gradient_checkpointing=True,
remove_unused_columns=False,
logging_steps=10,
evaluation_strategy="epoch",
save_strategy="epoch",
save_total_limit=2,
optim=config['train'].get('b2_optim', 'paged_adamw_8bit'),
max_grad_norm=float(config['train'].get('grad_clip', 1.0)),
dataloader_num_workers=int(config['train'].get('b2_num_workers', 4)),
dataloader_pin_memory=bool(config['train'].get('pin_memory', True)),
load_best_model_at_end=config['train'].get('b2_load_best_model_at_end', True),
metric_for_best_model=config['train'].get('b2_metric_for_best', 'eval_loss'),
greater_is_better=False,
)
training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
trainer = Trainer(
model=model,
args=training_args,
train_dataset=sft_train,
eval_dataset=sft_val,
data_collator=MultimodalDataCollator(processor, max_length=b2_max_length)
)
trainer.train()
        # [FIX] Evaluate B2 after training so Accuracy, F1, and BLEU are available for the comparison charts
from src.engine.medical_eval import evaluate_multimodal_vqa
print("[INFO] Đang chạy đánh giá nghiệm thu trên tập Validation cho B2...")
        # Switch the model to evaluation mode
model.eval()
metrics = evaluate_multimodal_vqa(
model,
val_loader,
device,
processor,
beam_width=config['eval'].get('beam_width_b', 1),
beam_width_closed=config['eval'].get('beam_width_b_closed', 1),
beam_width_open=config['eval'].get('beam_width_b_open', config['eval'].get('beam_width_b', 1)),
max_new_tokens_closed=config['eval'].get('max_new_tokens_b_closed', 4),
max_new_tokens_open=config['eval'].get('max_new_tokens_b_open', answer_max_words + 6),
generation_batch_size=config['eval'].get('generation_batch_size_b', 1),
max_words=answer_max_words,
variant='B2'
)
closed_eval = metrics.get('closed_eval', {})
open_eval = metrics.get('open_eval', {})
print(f"\n[RESULT B2 - CLOSED QUESTIONS]")
print(f"Count: {closed_eval.get('count', 0)}")
print(f"Accuracy: {closed_eval.get('accuracy', 0):.4f}")
print(f"EM: {closed_eval.get('em', 0):.4f}")
print(f"F1: {closed_eval.get('f1', 0):.4f}")
print(f"\n[RESULT B2 - OPEN QUESTIONS]")
print(f"Count: {open_eval.get('count', 0)}")
print(f"Semantic: {open_eval.get('semantic', 0):.4f}")
print(f"BERTScore: {open_eval.get('bert_score', 0):.4f}")
print(f"F1: {open_eval.get('f1', 0):.4f}")
print(f"ROUGE-L: {open_eval.get('rouge_l', 0):.4f}")
if 'long_answers_eval' in metrics:
print(f"\n[RESULT B2 - LONG METRICS]")
print(f"Accuracy: {metrics['long_answers_eval'].get('accuracy', 0):.4f}")
print(f"F1: {metrics['long_answers_eval'].get('f1', 0):.4f}")
print(f"Semantic: {metrics['long_answers_eval'].get('semantic', 0):.4f}")
print(f"BERTScore: {metrics['long_answers_eval'].get('bert_score', 0):.4f}")
            # Also append to log_history for wandb
trainer.state.log_history.append({
"epoch": training_args.num_train_epochs,
"val_long_accuracy": metrics['long_answers_eval'].get('accuracy', 0),
"val_long_f1": metrics['long_answers_eval'].get('f1', 0),
"val_long_semantic": metrics['long_answers_eval'].get('semantic', 0),
"val_long_bertscore": metrics['long_answers_eval'].get('bert_score', 0),
})
        # Attach results to history so compare_models.py can read them
final_epoch = training_args.num_train_epochs
trainer.state.log_history.append({
"epoch": final_epoch,
"val_accuracy_normalized": metrics.get('accuracy_normalized'),
"val_f1_normalized": metrics.get('f1_normalized'),
"val_bleu4_normalized": metrics.get('bleu4_normalized'),
"val_bert_score_raw": metrics.get('bert_score_raw'),
"val_semantic_raw": metrics.get('semantic_raw'),
"val_closed_accuracy": closed_eval.get('accuracy', 0),
"val_closed_em": closed_eval.get('em', 0),
"val_closed_f1": closed_eval.get('f1', 0),
"val_open_semantic": open_eval.get('semantic', 0),
"val_open_bertscore": open_eval.get('bert_score', 0),
"val_open_f1": open_eval.get('f1', 0),
"val_open_rouge_l": open_eval.get('rouge_l', 0),
})
save_history_records(history_dir, trainer.state.log_history)
return
elif args.variant == 'B1':
        # Zero-shot evaluation for Direction B
from src.engine.medical_eval import evaluate_multimodal_vqa
wrapper = MultimodalVQA(model_id=config['model_b']['model_name'])
model, processor = wrapper.load_model()
beam_width = config['eval'].get('beam_width_b', 1)
print(f"[INFO] Bắt đầu đánh giá B1 với Beam Width = {beam_width}...")
metrics = evaluate_multimodal_vqa(
model,
val_loader,
device,
processor,
beam_width=beam_width,
beam_width_closed=config['eval'].get('beam_width_b_closed', beam_width),
beam_width_open=config['eval'].get('beam_width_b_open', beam_width),
max_new_tokens_closed=config['eval'].get('max_new_tokens_b_closed', 4),
max_new_tokens_open=config['eval'].get('max_new_tokens_b_open', answer_max_words + 6),
generation_batch_size=config['eval'].get('generation_batch_size_b', 1),
max_words=answer_max_words,
variant='B1'
)
closed_eval = metrics.get('closed_eval', {})
open_eval = metrics.get('open_eval', {})
print(f"\n[RESULT B1 - CLOSED QUESTIONS]")
print(f"Count: {closed_eval.get('count', 0)}")
print(f"Accuracy: {closed_eval.get('accuracy', 0):.4f}")
print(f"EM: {closed_eval.get('em', 0):.4f}")
print(f"F1: {closed_eval.get('f1', 0):.4f}")
print(f"\n[RESULT B1 - OPEN QUESTIONS]")
print(f"Count: {open_eval.get('count', 0)}")
print(f"Semantic: {open_eval.get('semantic', 0):.4f}")
print(f"BERTScore: {open_eval.get('bert_score', 0):.4f}")
print(f"F1: {open_eval.get('f1', 0):.4f}")
print(f"ROUGE-L: {open_eval.get('rouge_l', 0):.4f}")
if 'long_answers_eval' in metrics:
print(f"\n[RESULT B1 - LONG METRICS]")
print(f"Accuracy: {metrics['long_answers_eval'].get('accuracy', 0):.4f}")
print(f"F1: {metrics['long_answers_eval'].get('f1', 0):.4f}")
print(f"Semantic: {metrics['long_answers_eval'].get('semantic', 0):.4f}")
print(f"BERTScore: {metrics['long_answers_eval'].get('bert_score', 0):.4f}")
        # [FIX] Save as a record with an 'epoch' key so compare_models.py can parse it
save_history_records(history_dir, [{
"epoch": 1,
"variant": "B1",
"beam_width": beam_width,
"train_loss": 0.0, # zero-shot không có train loss
"val_accuracy_normalized": float(metrics.get('accuracy_normalized', metrics.get('accuracy', 0))),
"val_f1_normalized": float(metrics.get('f1_normalized', metrics.get('f1', 0))),
"val_bleu4_normalized": float(metrics.get('bleu4_normalized', metrics.get('bleu4', 0))),
"val_bert_score_raw": float(metrics.get('bert_score_raw', metrics.get('bert_score', 0))),
"val_semantic_raw": float(metrics.get('semantic_raw', metrics.get('semantic', 0))),
"val_closed_accuracy": float(closed_eval.get('accuracy', 0)),
"val_closed_em": float(closed_eval.get('em', 0)),
"val_closed_f1": float(closed_eval.get('f1', 0)),
"val_open_semantic": float(open_eval.get('semantic', 0)),
"val_open_bertscore": float(open_eval.get('bert_score', 0)),
"val_open_f1": float(open_eval.get('f1', 0)),
"val_open_rouge_l": float(open_eval.get('rouge_l', 0)),
"metrics": metrics,
}])
return
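# Example invocations (flags follow the argparse definitions below):
#   python train_medical.py --variant A2                       # Direction A, transformer decoder
#   python train_medical.py --variant B2 --config configs/medical_vqa.yaml
#   python train_medical.py --variant DPO --debug --no_compare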
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="configs/medical_vqa.yaml")
parser.add_argument("--variant", type=str, choices=['A1', 'A2', 'B1', 'B2', 'DPO', 'PPO'], required=True)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--no_compare", action="store_true",
help="Bỏ qua vẽ chart so sánh 5 model sau khi train xong")
args = parser.parse_args()
train(args)
# Auto-generate comparison charts after training
if not args.no_compare:
import subprocess, sys
log_dir = "logs/medical_vqa/history"
out_dir = "results/charts"
print(f"\n[INFO] 📊 Tự động vẽ biểu đồ so sánh 5 model → {out_dir}/")
try:
subprocess.run(
[sys.executable, "scripts/compare_models.py",
"--log_dir", log_dir, "--out", out_dir],
check=False
)
except Exception as e:
print(f"[WARNING] compare_models.py thất bại: {e}")
print(" Chạy thủ công: python scripts/compare_models.py")