| |
| import json |
| from pathlib import Path |
| import re |
| import torch |
| from transformers import AutoTokenizer, Gemma3ForCausalLM |
| from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction |
| from tqdm import tqdm |
| import os |
| import torch._dynamo |
|
|
|
|
# Fall back to eager execution if torch.compile graphs fail during generate().
torch._dynamo.config.suppress_errors = True
# Allow TF32 matmuls on Ampere+ GPUs for faster bf16/fp32 inference.
torch.set_float32_matmul_precision('high')

# Translation direction: English -> Kazakh.
SRC_LANG, TGT_LANG = "en", "kk"
# Fine-tuned Gemma-3 checkpoint under evaluation.
MODEL_PATH = "/raid/srp_base_model_training/abai_workspace/models/sync_kk_en/checkpoint-final"
# FLORES test set pre-formatted as JSONL rows with "system"/"user"/"assistant" keys.
TEST_FILE = "/raid/srp_base_model_training/abai_workspace/data/flores/en_to_kk_formatted.jsonl"
OUTPUT_JSON = f"eval_sync_KKEN_data_{SRC_LANG}_to_{TGT_LANG}.json"
MAX_NEW_TOKS = 64  # generation cap per example
# NOTE(review): CUDA_VISIBLE_DEVICES is set *after* `import torch`. This only
# works because CUDA initializes lazily — confirm nothing touches CUDA before
# this line, otherwise the device mask is silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
DEVICE = "cuda"
| |
| |
def clean_user_field(user_str: str) -> str:
    """Strip a leading ``<src=xx><tgt=yy>`` tag pair from *user_str*.

    Any whitespace/newlines immediately following the tags are removed as
    well. Strings without the leading tag pair are returned unchanged.
    """
    tag_prefix = re.compile(r'^<src=[^>]+><tgt=[^>]+>\s*')
    return tag_prefix.sub('', user_str, count=1)
|
|
def load_model_and_tokenizer():
    """Load the fine-tuned Gemma-3 checkpoint and its tokenizer.

    Returns:
        (tokenizer, model): model is in eval mode, bfloat16, and sharded
        across the visible GPUs via ``device_map="auto"``.
    """
    # Original print contained a mojibake'd ellipsis ("β¦"); restored to "…".
    print(f"Loading model/tokenizer from {MODEL_PATH} …")
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = Gemma3ForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()  # disable dropout etc. for deterministic inference
    return tok, model
|
|
def build_prompt(system: str, user: str) -> str:
    """Assemble a single-example chat prompt in the checkpoint's turn format.

    NOTE(review): stock Gemma chat templates use the role name ``model`` and a
    trailing newline after the final ``<start_of_turn>``; this checkpoint was
    presumably fine-tuned with ``system``/``assistant`` turns as emitted here —
    confirm against the training data format before changing.
    """
    turns = [
        f"<start_of_turn>system\n{system}<end_of_turn>",
        f"<start_of_turn>user\n{user}<end_of_turn>",
        "<start_of_turn>assistant",
    ]
    return "\n".join(turns)
|
|
def run_inference(tok, model, system: str, user: str) -> str:
    """Greedily generate the model's reply for one (system, user) pair.

    Decodes only the tokens produced after the prompt and returns them with
    special tokens stripped and surrounding whitespace removed.
    """
    encoded = tok(
        build_prompt(system, user), return_tensors="pt", truncation=True
    ).to(model.device)
    prompt_len = encoded["input_ids"].shape[-1]
    # Stop as soon as the model closes its turn.
    end_turn_id = tok.convert_tokens_to_ids("<end_of_turn>")

    with torch.inference_mode():
        generated = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKS,
            do_sample=False,
            eos_token_id=end_turn_id,
            pad_token_id=tok.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    continuation = generated[0][prompt_len:]
    return tok.decode(continuation, skip_special_tokens=True).strip()
|
|
def load_test_examples(path: str):
    """Read a JSONL eval file into (system, user, assistant) tuples.

    Each line must be a JSON object with "system", "user" and "assistant"
    string fields; values are stripped of surrounding whitespace. Blank lines
    are skipped (the original crashed on them). The file handle is closed via
    a context manager (the original leaked it).

    Raises:
        KeyError / json.JSONDecodeError on malformed rows.
    """
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            examples.append((obj["system"].strip(),
                             obj["user"].strip(),
                             obj["assistant"].strip()))
    return examples
|
|
def evaluate_bleu_nltk(hyps, refs):
    """
    Compute corpus-level 4-gram BLEU using NLTK on whitespace-tokenized text.

    Args:
        hyps: list of hypothesis strings.
        refs: list of reference strings (exactly one reference per hypothesis).

    Returns:
        BLEU as a percentage in [0, 100], rounded to 2 decimals (e.g. 27.53).
        The original returned the raw 0–1 fraction, contradicting its own
        docstring; it is now scaled by 100 to match the documented contract.
    """
    tokenized_hyps = [hyp.split() for hyp in hyps]
    # corpus_bleu expects, for each hypothesis, a *list* of reference
    # token lists — hence the extra nesting.
    tokenized_refs = [[ref.split()] for ref in refs]

    # Smoothing prevents a zero score when some n-gram order has no overlap.
    smoothing = SmoothingFunction().method1

    score = corpus_bleu(
        tokenized_refs,
        tokenized_hyps,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing,
    )

    # NLTK returns a fraction in [0, 1]; scale to a percentage as documented.
    return round(score * 100, 2)
|
|
def main():
    """Translate every test example, score corpus BLEU, and dump results.

    Writes OUTPUT_JSON containing the model path, the BLEU score, and one
    record per example with the (tag-stripped) user input, reference, and
    model hypothesis.
    """
    tok, model = load_model_and_tokenizer()
    examples = load_test_examples(TEST_FILE)
    hyps, refs, users = [], [], []

    for system, user, assistant in tqdm(examples, desc="Translating"):
        clean_user = clean_user_field(user)
        hyps.append(run_inference(tok, model, system, clean_user))
        refs.append(assistant)
        users.append(clean_user)

    bleu_score = evaluate_bleu_nltk(hyps, refs)

    out = {
        "model": MODEL_PATH,
        "bleu": bleu_score,
        "examples": [
            {"system": s, "user": u, "reference": r, "hypothesis": h}
            for (s, _, r), u, h in zip(examples, users, hyps)
        ],
    }

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    # The original print held a mojibake'd check-mark emoji that split the
    # statement across two source lines; replaced with plain ASCII.
    print(f"Saved cleaned evaluation to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()
|
|