| |
| import json |
| from pathlib import Path |
| import re |
| import torch |
| from transformers import AutoTokenizer, Gemma3ForCausalLM |
| from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction |
| from tqdm import tqdm |
| import os |
| import torch._dynamo |
|
|
|
|
# Fall back to eager execution if torch.compile graphs fail during generate().
torch._dynamo.config.suppress_errors = True
# Allow TF32 matmuls on Ampere+ GPUs for faster bf16/fp32 inference.
torch.set_float32_matmul_precision('high')

# Translation direction: English -> Kazakh.
SRC_LANG, TGT_LANG = "en", "kk"
# Fine-tuned Gemma-3 checkpoint under evaluation.
MODEL_PATH = "/raid/srp_base_model_training/abai_workspace/models/sync_kk_en/checkpoint-final"
# FLORES test set pre-formatted as JSONL rows with "system"/"user"/"assistant" keys.
TEST_FILE = "/raid/srp_base_model_training/abai_workspace/data/flores/en_to_kk_formatted.jsonl"
OUTPUT_JSON = f"eval_sync_KKEN_data_{SRC_LANG}_to_{TGT_LANG}.json"
MAX_NEW_TOKS = 64  # generation cap per example
# NOTE(review): CUDA_VISIBLE_DEVICES is set *after* `import torch`. This only
# works because CUDA initializes lazily — confirm nothing touches CUDA before
# this line, otherwise the device mask is silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
DEVICE = "cuda"
| |
| |
def clean_user_field(user_str: str) -> str:
    """Strip a leading ``<src=xx><tgt=yy>`` tag pair from *user_str*.

    Any whitespace/newlines immediately following the tags are removed as
    well. Strings without the leading tag pair are returned unchanged.
    """
    tag_prefix = re.compile(r'^<src=[^>]+><tgt=[^>]+>\s*')
    return tag_prefix.sub('', user_str, count=1)
|
|
def load_model_and_tokenizer():
    """Load the fine-tuned Gemma-3 checkpoint and its tokenizer.

    Returns:
        (tokenizer, model): model is in eval mode, bfloat16, and sharded
        across the visible GPUs via ``device_map="auto"``.
    """
    # Original print contained a mojibake'd ellipsis ("β¦"); restored to "…".
    print(f"Loading model/tokenizer from {MODEL_PATH} …")
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = Gemma3ForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()  # disable dropout etc. for deterministic inference
    return tok, model
|
|
def build_prompt(system: str, user: str) -> str:
    """Assemble a single-example chat prompt in the checkpoint's turn format.

    NOTE(review): stock Gemma chat templates use the role name ``model`` and a
    trailing newline after the final ``<start_of_turn>``; this checkpoint was
    presumably fine-tuned with ``system``/``assistant`` turns as emitted here —
    confirm against the training data format before changing.
    """
    turns = [
        f"<start_of_turn>system\n{system}<end_of_turn>",
        f"<start_of_turn>user\n{user}<end_of_turn>",
        "<start_of_turn>assistant",
    ]
    return "\n".join(turns)
|
|
def run_inference(tok, model, system: str, user: str) -> str:
    """Greedily generate the model's reply for one (system, user) pair.

    Decodes only the tokens produced after the prompt and returns them with
    special tokens stripped and surrounding whitespace removed.
    """
    encoded = tok(
        build_prompt(system, user), return_tensors="pt", truncation=True
    ).to(model.device)
    prompt_len = encoded["input_ids"].shape[-1]
    # Stop as soon as the model closes its turn.
    end_turn_id = tok.convert_tokens_to_ids("<end_of_turn>")

    with torch.inference_mode():
        generated = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKS,
            do_sample=False,
            eos_token_id=end_turn_id,
            pad_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    continuation = generated[0][prompt_len:]
    return tok.decode(continuation, skip_special_tokens=True).strip()
|
|
def load_test_examples(path: str):
    """Read a JSONL eval file into (system, user, assistant) tuples.

    Each line must be a JSON object with "system", "user" and "assistant"
    string fields; values are stripped of surrounding whitespace. Blank lines
    are skipped (the original crashed on them). The file handle is closed via
    a context manager (the original leaked it).

    Raises:
        KeyError / json.JSONDecodeError on malformed rows.
    """
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            examples.append((obj["system"].strip(),
                             obj["user"].strip(),
                             obj["assistant"].strip()))
    return examples
|
|
def evaluate_bleu_nltk(hyps, refs):
    """
    Compute corpus-level 4-gram BLEU using NLTK on whitespace-tokenized text.

    Args:
        hyps: list of hypothesis strings.
        refs: list of reference strings (exactly one reference per hypothesis).

    Returns:
        BLEU as a percentage in [0, 100], rounded to 2 decimals (e.g. 27.53).
        The original returned the raw 0–1 fraction, contradicting its own
        docstring; it is now scaled by 100 to match the documented contract.
    """
    tokenized_hyps = [hyp.split() for hyp in hyps]
    # corpus_bleu expects, for each hypothesis, a *list* of reference
    # token lists — hence the extra nesting.
    tokenized_refs = [[ref.split()] for ref in refs]

    # Smoothing prevents a zero score when some n-gram order has no overlap.
    smoothing = SmoothingFunction().method1

    score = corpus_bleu(
        tokenized_refs,
        tokenized_hyps,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing,
    )

    # NLTK returns a fraction in [0, 1]; scale to a percentage as documented.
    return round(score * 100, 2)
|
|
def main():
    """Translate every test example, score corpus BLEU, and dump results.

    Writes OUTPUT_JSON containing the model path, the BLEU score, and one
    record per example with the (tag-stripped) user input, reference, and
    model hypothesis.
    """
    tok, model = load_model_and_tokenizer()
    examples = load_test_examples(TEST_FILE)
    hyps, refs, users = [], [], []

    for system, user, assistant in tqdm(examples, desc="Translating"):
        clean_user = clean_user_field(user)
        hyps.append(run_inference(tok, model, system, clean_user))
        refs.append(assistant)
        users.append(clean_user)

    bleu_score = evaluate_bleu_nltk(hyps, refs)

    out = {
        "model": MODEL_PATH,
        "bleu": bleu_score,
        "examples": [
            {"system": s, "user": u, "reference": r, "hypothesis": h}
            for (s, _, r), u, h in zip(examples, users, hyps)
        ],
    }

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    # The original print held a mojibake'd check-mark emoji that split the
    # statement across two source lines; replaced with plain ASCII.
    print(f"Saved cleaned evaluation to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()
|
|