| """LLM loading and initialization.""" |
|
|
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
| from langchain_community.llms import HuggingFacePipeline |
| import streamlit as st |
|
|
|
|
@st.cache_resource
def load_qwen_llm(model_name, max_new_tokens=256, temperature=0.7, top_p=0.95):
    """
    Load a Qwen causal LM and wrap it as a LangChain-compatible LLM.

    Cached via ``st.cache_resource`` so the model is loaded once per
    Streamlit process instead of on every script rerun.

    Args:
        model_name: HuggingFace model identifier.
        max_new_tokens: Maximum tokens to generate.
        temperature: Sampling temperature. Values <= 0 disable sampling
            (greedy decoding) instead of crashing the pipeline.
        top_p: Nucleus sampling parameter (only used when sampling).

    Returns:
        HuggingFacePipeline: Wrapped LLM for LangChain.
    """
    # trust_remote_code is needed for Qwen's custom tokenizer/model code.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # fp16 on GPU to halve memory; fp32 on CPU where fp16 is slow/unsupported.
    # device_map="auto" lets accelerate place/shard the model across GPUs;
    # None keeps it on CPU.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )

    # transformers raises a ValueError for do_sample=True with temperature<=0;
    # treat temperature<=0 as a request for deterministic (greedy) decoding.
    do_sample = temperature > 0

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        # Only return newly generated text, not the echoed prompt.
        "return_full_text": False,
        # Avoid the "pad_token_id not set" warning for models (incl. some
        # Qwen variants) that define no pad token.
        "pad_token_id": (
            tokenizer.pad_token_id
            if tokenizer.pad_token_id is not None
            else tokenizer.eos_token_id
        ),
    }
    if do_sample:
        # Sampling-only knobs; passing them with do_sample=False triggers
        # warnings in recent transformers versions.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **gen_kwargs
    )

    return HuggingFacePipeline(pipeline=pipe)
|
|