"""Loader for a fine-tuned causal-LM chat model via Hugging Face transformers."""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to the fine-tuned model weights (adjust as needed).
MODEL_PATH = "model_files"

# System prompt for guiding model behavior.
DEFAULT_PROMPT = """<|system|> You are a compassionate listener. Respond with:
- Short, natural sentences
- Occasional empathetic sounds ("Oh...", "I see")
- Open-ended questions when appropriate
- Validation before advice
- Clear crisis handoff when needed

Examples of good responses:
1. "That sounds really overwhelming. Can you tell me more about what's been happening?"
2. "I'm hearing a lot of pain in what you're sharing. Have you talked to anyone about this?"
3. "This seems really important. Let's focus on how you're feeling right now."
"""


def load_model():
    """
    Load the fine-tuned model and tokenizer with memory/performance optimizations.

    Uses FP16 + 4-bit quantization and automatic device mapping on GPU,
    FP32 on CPU.

    Returns:
        model: The loaded Hugging Face causal-LM model.
        tokenizer: The corresponding tokenizer (left-padded for generation).
        device: "cuda" or "cpu" — the primary device the model runs on.
    """
    print(f"🔍 Loading model from: {MODEL_PATH}")

    use_cuda = torch.cuda.is_available()

    # 1. Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        cache_dir="./cache",    # Cache directory for faster reloads
        use_fast=True,          # Fast (Rust) tokenizer for better performance
        padding_side="left",    # Left padding is required for decoder-only generation
    )
    # Causal-LM tokenizers often ship without a pad token; padded batch
    # generation fails unless one is set, so fall back to the EOS token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 2. Load model with memory optimizations.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        cache_dir="./cache",
        # NOTE(security): trust_remote_code executes model-repo Python code;
        # acceptable here only because MODEL_PATH is a local, trusted checkpoint.
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,  # FP16 on GPU
        device_map="auto",        # accelerate dispatches weights to available devices
        load_in_4bit=use_cuda,    # 4-bit quantization (needs CUDA + bitsandbytes)
    )

    # 3. Report the primary device.
    # BUG FIX: the original called model.to(device) here. With
    # device_map="auto" the model is already dispatched by accelerate, and
    # calling .to() on a dispatched (or 4-bit quantized) model raises an
    # error — so no manual move is performed.
    device = "cuda" if use_cuda else "cpu"

    print("✅ Model successfully loaded.")
    return model, tokenizer, device


# Test the loader
if __name__ == "__main__":
    model, tokenizer, device = load_model()
    print("Model and tokenizer successfully loaded.")