"""Sample text from a locally trained Llama-style model checkpoint."""

import json
import sys

import torch
from transformers import AutoTokenizer

from llama_modeling.config import LlamaConfig
from llama_modeling.front_end import LlamaForCausalLM
from utils.trainutils import load_checkpoint


def generate_text(model, tokenizer, prompt, device, max_new_tokens=30):
    """Tokenize a prompt, sample a continuation, and decode it back to text."""
    # Move inputs to the caller-supplied device instead of assuming CUDA,
    # so the script also runs on CPU-only machines.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.7,  # sampling temperature passed to the custom generate()
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def main():
    if len(sys.argv) != 2:
        print("Usage: python inference.py <path_to_model>")
        sys.exit(1)

    model_path = sys.argv[1]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Rebuild the model config from the saved training config, keeping only
    # the keys that LlamaConfig defines as dataclass fields.
    with open("config.json") as f:
        config_dict = json.load(f)
    config = LlamaConfig(**{k: v for k, v in config_dict.items()
                            if k in LlamaConfig.__dataclass_fields__})

    model = LlamaForCausalLM(config).to(device)

    load_checkpoint(model, model_path)
    model.eval()
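
    # Optional sanity check (an addition here): report the parameter count
    # so a config/checkpoint mismatch is easy to spot before generating.
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Loaded {n_params / 1e6:.1f}M parameters on {device}")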

    # The tokenizer is loaded from a local SmolLM2-135M-Instruct directory;
    # the weights come from the checkpoint restored above.
    tokenizer = AutoTokenizer.from_pretrained("./SmolLM2-135M-Instruct")

    prompts = [
        "Once upon a time,",
        "The best way to learn programming is",
        "Here's a recipe for chocolate cake:",
    ]

    # Autocast on whichever device is active; generate_text already runs
    # under torch.inference_mode, so a separate no_grad wrapper is redundant.
    with torch.autocast(device_type=device):
        for prompt in prompts:
            print(f"\nPrompt: {prompt}")
            output = generate_text(model, tokenizer, prompt, device)
            print(f"Generated: {output}")
            print("-" * 50)


if __name__ == "__main__":
    main()
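
# Example invocation (hypothetical checkpoint path):
#   python inference.py checkpoints/model_final.pt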