| |
| """ |
| Test Fine-tuned Model vs Original |
| |
| Compare the fine-tuned model with the original FLAN-T5 |
| on our target words: PANESAR, RAJOURI, XANTHIC |
| """ |
|
|
| import torch |
| from pathlib import Path |
| from typing import List, Dict |
|
|
# transformers is treated as an optional dependency: record whether it could
# be imported so main() can print a hint instead of failing with a traceback.
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
|
|
|
|
class ModelComparison:
    """Compare the original FLAN-T5-small model with a fine-tuned copy.

    Both models are prompted to generate a crossword clue for the same word
    and the outputs are printed side by side.
    """

    # Hub identifier of the baseline model.
    ORIGINAL_MODEL_NAME = "google/flan-t5-small"

    # Words that the comparison report labels as "in training data".
    TRAINING_WORDS = frozenset({"TENDULKAR", "BEETHOVEN", "PIANO"})

    def __init__(self):
        # Download cache for the original model, one level above this file's
        # directory.
        self.cache_dir = Path(__file__).parent.parent / "cache-dir"
        # Directory next to this file that holds the fine-tuned model.
        self.fine_tuned_dir = Path(__file__).parent / "fine_tuned_model"

        # All four are populated by load_models().
        self.original_model = None
        self.original_tokenizer = None
        self.fine_tuned_model = None
        self.fine_tuned_tokenizer = None

    def load_models(self) -> bool:
        """Load the original and the fine-tuned model with their tokenizers.

        Returns:
            True when both models were loaded, False when the fine-tuned
            model directory does not exist (nothing is loaded in that case).
        """
        # Check the cheap precondition first so that a missing fine-tuned
        # model does not cost a download/load of the original one.
        if not self.fine_tuned_dir.exists():
            print("❌ Fine-tuned model not found - run training first")
            return False

        print("🔄 Loading original FLAN-T5-small...")
        self.original_tokenizer = AutoTokenizer.from_pretrained(
            self.ORIGINAL_MODEL_NAME,
            cache_dir=str(self.cache_dir),
        )
        self.original_model = AutoModelForSeq2SeqLM.from_pretrained(
            self.ORIGINAL_MODEL_NAME,
            cache_dir=str(self.cache_dir),
        )
        print("✅ Original model loaded")

        print("🔄 Loading fine-tuned model...")
        self.fine_tuned_tokenizer = AutoTokenizer.from_pretrained(
            str(self.fine_tuned_dir)
        )
        self.fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
            str(self.fine_tuned_dir)
        )
        print("✅ Fine-tuned model loaded")

        return True

    def generate_clue(self, model, tokenizer, word: str) -> str:
        """Generate a crossword clue for *word* with the given model.

        Args:
            model: Object exposing ``generate(**kwargs)``.
            tokenizer: Callable tokenizer that also provides ``decode`` and
                ``pad_token_id``.
            word: The word to generate a clue for.

        Returns:
            The decoded text of the first generated sequence, with the
            prompt removed if the output echoes it.
        """
        prompt = f"Generate a crossword clue for: {word}"

        inputs = tokenizer(prompt, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            # NOTE: do_sample=True enables sampling, so repeated calls may
            # return different clues for the same word.
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,
                num_beams=3,
                temperature=0.7,
                do_sample=True,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id,
            )

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Strip the prompt when the decoded text repeats it.
        if prompt in result:
            result = result.replace(prompt, "").strip()

        return result

    def compare_models(self) -> List[Dict]:
        """Print clues from both models for a fixed list of target words.

        Returns:
            One dict per target word with the keys ``word``, ``original``,
            ``fine_tuned`` and ``in_training``.

        Raises:
            RuntimeError: If the models have not been loaded yet.
        """
        self._require_loaded(
            "original_model",
            "original_tokenizer",
            "fine_tuned_model",
            "fine_tuned_tokenizer",
        )

        target_words = [
            "PANESAR",
            "TENDULKAR",
            "RAJOURI",
            "XANTHIC",
            "SERENDIPITY",
            "BEETHOVEN",
            "PIANO",
        ]

        print("\n🔬 COMPARING ORIGINAL vs FINE-TUNED")
        print("=" * 70)

        results = []

        for word in target_words:
            print(f"\n📝 {word}:")

            original_clue = self.generate_clue(
                self.original_model,
                self.original_tokenizer,
                word,
            )
            fine_tuned_clue = self.generate_clue(
                self.fine_tuned_model,
                self.fine_tuned_tokenizer,
                word,
            )

            print(f" Original: \"{original_clue}\"")
            print(f" Fine-tuned: \"{fine_tuned_clue}\"")

            in_training = word.upper() in self.TRAINING_WORDS

            if in_training:
                print(" Note: This word WAS in training data")
            else:
                print(" Note: This word was NOT in training data")

            results.append({
                "word": word,
                "original": original_clue,
                "fine_tuned": fine_tuned_clue,
                "in_training": in_training,
            })

        print("\n" + "=" * 70)
        print("📊 ANALYSIS")
        print("=" * 70)

        print("\n🎯 Words in Training Data:")
        for result in results:
            if result["in_training"]:
                print(f" {result['word']:12} → \"{result['fine_tuned']}\"")

        print("\n🔍 Words NOT in Training Data (Transfer Learning Test):")
        for result in results:
            if not result["in_training"]:
                print(f" {result['word']:12} → \"{result['fine_tuned']}\"")

        print("\n💡 CONCLUSIONS:")
        print("1. If fine-tuned model is worse on training data words,")
        print(" then fine-tuning failed completely")
        print("2. If it's better on training data but bad on new words,")
        print(" then it overfitted and didn't generalize")
        print("3. If it's better on both, then transfer learning succeeded!")

        return results

    def test_training_examples(self):
        """Print fine-tuned clues for a few word/expected-clue pairs.

        Each generated clue is compared with the expected clue and reported
        as similar when they share at least one word.

        Raises:
            RuntimeError: If the fine-tuned model has not been loaded yet.
        """
        self._require_loaded("fine_tuned_model", "fine_tuned_tokenizer")

        print("\n📚 Testing on EXACT Training Examples:")
        print("=" * 50)

        training_examples = [
            ("PIANO", "88-key instrument"),
            ("BEETHOVEN", "Austrian composer"),
            ("OXYGEN", "Life-sustaining gas"),
            ("EINSTEIN", "Relativity physicist"),
        ]

        for word, expected in training_examples:
            generated = self.generate_clue(
                self.fine_tuned_model,
                self.fine_tuned_tokenizer,
                word,
            )

            print(f"{word:12}: Expected: \"{expected}\"")
            print(f"{'':12} Generated: \"{generated}\"")

            if self._has_similarity(expected, generated):
                print(f"{'':12} Status: ✅ Some similarity")
            else:
                print(f"{'':12} Status: ❌ No similarity")
            print()

    def _require_loaded(self, *attributes: str) -> None:
        """Raise RuntimeError if any of the named attributes is still None."""
        missing = [name for name in attributes if getattr(self, name, None) is None]
        if missing:
            raise RuntimeError(
                f"Not loaded: {', '.join(missing)} - call load_models() first"
            )

    @staticmethod
    def _has_similarity(expected: str, generated: str) -> bool:
        """Return True when any word of *expected* occurs in *generated*.

        The comparison is a case-insensitive substring test, so "instrument"
        also matches "instruments".
        """
        haystack = generated.lower()
        return any(token in haystack for token in expected.lower().split())
|
|
|
|
def main():
    """Load both models and print the evaluation reports.

    Returns early, after printing a message, when the transformers library
    is unavailable or when the models could not be loaded.
    """
    print("🧪 FINE-TUNED MODEL EVALUATION")
    print("=" * 50)

    if not TRANSFORMERS_AVAILABLE:
        print("❌ Need transformers library")
        return

    comparison = ModelComparison()

    if not comparison.load_models():
        return

    # First check the exact training examples, then compare both models
    # on the target words.
    comparison.test_training_examples()
    comparison.compare_models()
|
|
|
|
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()