Dyno1307
/

translator

Model card Files Files and versions

translator / baseline_analysis.py

Dyno1307's picture

Upload 67 files

e599a18 verified 6 months ago

history blame contribute delete

2.14 kB

	# baseline_analysis.py

	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import torch

	# Baseline check: translate a handful of Sinhala sentences to English with an
	# off-the-shelf NLLB-200 checkpoint and print each source/translation pair.

	# Checkpoint under test. The distilled 600M variant is a smaller, faster
	# version of NLLB-200, which is enough for this quick test.
	model_name = "facebook/nllb-200-distilled-600M"

	# NLLB language codes for the translation direction (Sinhala -> English).
	source_lang = "sin_Sinh"
	target_lang = "eng_Latn"

	# Load the pre-trained tokenizer and model from Hugging Face.
	# This might take a minute to download the first time.
	print(f"Loading model: {model_name}")
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	print("Model loaded successfully!")

	# The source language is the same for every sentence, so tell the tokenizer
	# once instead of on every loop iteration.
	tokenizer.src_lang = source_lang

	# Id of the target-language token, used to force the model to start its
	# output in English. Looked up once through the tokenizer's public API rather
	# than indexing `tokenizer.vocab` for every sentence.
	target_lang_id = tokenizer.convert_tokens_to_ids(target_lang)

	# Sentences we want to translate.
	sinhala_sentences = [
	    "ඩෝසන් මිස් දුරකථනයෙන් ඩෝසන් මිස් කවුද සර්",
	    "කවුද ඩෝසන් නැතුව ඉන්නේ ඔව් සර්",
	    "ඔබ එය උත්සාහ කරන්න සර්",
	    "කොහොමද වැඩේ හරිද ඔව් සර්ට ස්තුතියි",
	    "ඔව්, හරි, ස්තුතියි රත්තරං",
	]

	print("\n--- Starting Translation ---")

	# Translate one sentence at a time.
	for sentence in sinhala_sentences:
	    # 1. Convert the text into tensors the model understands (input IDs).
	    inputs = tokenizer(sentence, return_tensors="pt")

	    # 2. Generate the translation, forcing the first generated token to be
	    #    the target-language token.
	    translated_tokens = model.generate(
	        **inputs,
	        forced_bos_token_id=target_lang_id,
	        max_length=50,  # Set a max length for the output
	    )

	    # 3. Convert the model's output tokens back into readable text.
	    translation = tokenizer.batch_decode(
	        translated_tokens, skip_special_tokens=True
	    )[0]

	    # 4. Display the results.
	    print(f"\nOriginal (si): {sentence}")
	    print(f"Translation (en): {translation}")

	print("\n--- Translation Complete ---")