| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
| |
# Local directory (or Hugging Face hub id) containing the fine-tuned
# model weights and tokenizer files.
MODEL_PATH = "model_files"


# System prompt prepended to conversations. Uses <|system|> ... </s>
# chat-template markers — presumably matching the base model's expected
# chat format; NOTE(review): confirm against the tokenizer's chat template.
# Runtime string: must not be altered or reflowed.
DEFAULT_PROMPT = """<|system|>
You are a compassionate listener. Respond with:
- Short, natural sentences
- Occasional empathetic sounds ("Oh...", "I see")
- Open-ended questions when appropriate
- Validation before advice
- Clear crisis handoff when needed
Examples of good responses:
1. "That sounds really overwhelming. Can you tell me more about what's been happening?"
2. "I'm hearing a lot of pain in what you're sharing. Have you talked to anyone about this?"
3. "This seems really important. Let's focus on how you're feeling right now."
</s>"""
|
|
def load_model():
    """
    Load the fine-tuned causal-LM model and tokenizer from MODEL_PATH.

    Uses fp16 + 4-bit quantization when CUDA is available, fp32 on CPU.
    Device placement is delegated entirely to accelerate via
    ``device_map="auto"``.

    Returns:
        model: The loaded Hugging Face model, in eval mode.
        tokenizer: The corresponding tokenizer (left-padded, with a
            pad token guaranteed to exist).
        device: "cuda" or "cpu" — the primary device for input tensors.
    """
    print(f"🔍 Loading model from: {MODEL_PATH}")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        cache_dir="./cache",
        use_fast=True,
        padding_side="left",  # left padding is required for causal-LM generation
    )
    # Robustness: many causal-LM tokenizers ship without a pad token, and
    # left-padded batching crashes without one. Reuse EOS as pad.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): load_in_4bit as a direct kwarg is deprecated in recent
    # transformers versions; migrate to quantization_config=BitsAndBytesConfig(
    # load_in_4bit=True) when the dependency floor allows.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        cache_dir="./cache",
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto",
        load_in_4bit=use_cuda,
    )

    # BUG FIX: the original called model.to(device) here. With
    # device_map="auto" accelerate has already dispatched the weights, and
    # calling .to() on a 4-bit quantized model raises ValueError at runtime.
    # No manual move is needed — accelerate owns placement.
    model.eval()  # inference only: disable dropout etc.

    print("✅ Model successfully loaded.")
    return model, tokenizer, device
|
|
| |
| if __name__ == "__main__": |
| model, tokenizer, device = load_model() |
| print("Model and tokenizer successfully loaded.") |