| import os |
| import streamlit as st |
|
|
| |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline |
| from langchain_huggingface import HuggingFacePipeline |
| from langchain.prompts import PromptTemplate |
| from langchain.schema import StrOutputParser |
|
|
| |
| from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
| from llama_index.llms.huggingface import HuggingFaceLLM |
|
|
# --- Page chrome ------------------------------------------------------------
st.set_page_config(
    page_title="Tiny LLM Starter",
    page_icon="🧪",
    layout="centered",
)
st.title("🧪 Tiny LLM Starter – LangChain + LlamaIndex")

# Help text rendered at the bottom of the sidebar. Kept flush-left so the
# markdown bullets are not interpreted as an indented code block.
_SIDEBAR_TIPS = """
**Tips**
- Uses local CPU (no key required)
- Small model → lower memory, faster cold start
- You can later add an `HF_TOKEN` secret for hosted inference
"""

# --- Sidebar: generation settings read by the model loaders further down ----
with st.sidebar:
    st.header("Model Settings")

    # Hugging Face checkpoint id used for generation.
    MODEL_ID = st.text_input(
        label="HF model id (seq2seq)",
        value="google/flan-t5-small",
    )
    # Upper bound on the number of generated tokens (32..512, step 32).
    MAX_NEW_TOKENS = st.slider(
        label="max_new_tokens",
        min_value=32,
        max_value=512,
        value=256,
        step=32,
    )
    # Sampling temperature (0.0..1.0, step 0.1).
    TEMP = st.slider(
        label="temperature",
        min_value=0.0,
        max_value=1.0,
        value=0.2,
        step=0.1,
    )

    st.markdown(_SIDEBAR_TIPS)
|
|
| |
@st.cache_resource(show_spinner=True)
def load_langchain_pipeline(model_id: str, max_new_tokens: int):
    """Return a LangChain LLM backed by a local Hugging Face seq2seq pipeline.

    The tokenizer and the seq2seq model are both loaded from ``model_id`` and
    wrapped in a ``text2text-generation`` pipeline whose output length is
    capped at ``max_new_tokens``. The function is decorated with
    ``st.cache_resource`` so the result is reused by Streamlit's resource
    cache instead of being rebuilt on every call.

    Args:
        model_id: Hugging Face model identifier of a seq2seq checkpoint.
        max_new_tokens: Maximum number of tokens the pipeline may generate.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured transformers
        pipeline.
    """
    text2text = pipeline(
        "text2text-generation",
        # Keyword order keeps the tokenizer load ahead of the model load.
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        model=AutoModelForSeq2SeqLM.from_pretrained(model_id),
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=text2text)
|
|
@st.cache_resource(show_spinner=True)
def load_llamaindex_stack(model_id: str, max_new_tokens: int, temperature: float):
    """Build a LlamaIndex query engine over the text files in ``./data``.

    Configures the global LlamaIndex ``Settings`` with a MiniLM sentence
    embedding model and a CPU-hosted ``HuggingFaceLLM``, indexes every
    document found in the ``data`` directory, and returns a query engine that
    retrieves the three most similar nodes per query.

    Args:
        model_id: Hugging Face model identifier used for both the model and
            its tokenizer.
        max_new_tokens: Maximum number of tokens generated per answer.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding; positive values enable sampling at that temperature.

    Returns:
        The query engine produced by ``VectorStoreIndex.as_query_engine``.

    Raises:
        Whatever ``SimpleDirectoryReader`` raises when ``data`` is missing or
        contains no readable files; callers should make sure it exists.
    """
    embed = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Temperature only has an effect when sampling is switched on, and a
    # temperature of 0 is not a valid sampling temperature, so pick greedy
    # decoding explicitly in that case.
    if temperature > 0:
        generate_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        generate_kwargs = {"do_sample": False}

    # NOTE(review): HuggingFaceLLM appears to load checkpoints as causal LMs,
    # while the sidebar default is a seq2seq checkpoint — confirm that the
    # chosen model id actually loads here.
    llm = HuggingFaceLLM(
        model_name=model_id,
        tokenizer_name=model_id,
        context_window=2048,
        # max_new_tokens is a dedicated HuggingFaceLLM argument; passing it
        # inside generate_kwargs as well would hand generate() the same
        # keyword twice.
        max_new_tokens=max_new_tokens,
        generate_kwargs=generate_kwargs,
        device_map="cpu",
    )

    # LlamaIndex reads these process-wide defaults when building the index
    # and the query engine below.
    Settings.embed_model = embed
    Settings.llm = llm

    # SimpleDirectoryReader takes a single directory via ``input_dir``.
    docs = SimpleDirectoryReader(input_dir="data").load_data()
    index = VectorStoreIndex.from_documents(docs)
    return index.as_query_engine(similarity_top_k=3)
|
|
tab1, tab2 = st.tabs(["🟣 LangChain Chat", "🟡 LlamaIndex mini-RAG"])

# --- Tab 1: single-turn question answering through a LangChain chain --------
with tab1:
    st.subheader("LangChain (local HF pipeline)")

    # Model id and token budget come from the sidebar widgets.
    chat_llm = load_langchain_pipeline(MODEL_ID, MAX_NEW_TOKENS)

    question = st.text_input("Ask anything:", value="What is this app?")
    generate_clicked = st.button("Generate (LangChain)", type="primary")

    if generate_clicked:
        # prompt -> local LLM -> plain string
        qa_prompt = PromptTemplate.from_template(
            "You are a concise, helpful assistant.\n\nQuestion: {q}\nAnswer:"
        )
        to_text = StrOutputParser()
        qa_chain = qa_prompt | chat_llm | to_text

        with st.spinner("Thinking..."):
            answer = qa_chain.invoke({"q": question})
        st.write(answer)
|
|
| |
# --- Tab 2: retrieval-augmented answers over the files in ./data ------------
with tab2:
    st.subheader("LlamaIndex over a tiny text file")
    st.caption("Uploads are optional; otherwise it uses ./data/notes.txt")
    uploaded = st.file_uploader("Upload a .txt file to index (optional)", type=["txt"])

    if uploaded is not None:
        os.makedirs("data", exist_ok=True)
        upload_path = os.path.join("data", "user.txt")
        # getvalue() returns the full buffer regardless of the current read
        # position of the uploaded file object.
        payload = uploaded.getvalue()

        try:
            with open(upload_path, "rb") as existing:
                already_saved = existing.read() == payload
        except FileNotFoundError:
            already_saved = False

        if not already_saved:
            with open(upload_path, "wb") as f:
                f.write(payload)
            # The query engine is cached on (model, tokens, temperature) only,
            # so drop it whenever the indexed files change; otherwise the new
            # upload would never be indexed.
            load_llamaindex_stack.clear()

    # Building the index needs at least one file in ./data; show a hint
    # instead of letting the reader fail when nothing is there yet.
    if not os.path.isdir("data") or not os.listdir("data"):
        st.info("No documents to index yet. Add ./data/notes.txt or upload a .txt file above.")
    else:
        qe = load_llamaindex_stack(MODEL_ID, MAX_NEW_TOKENS, TEMP)

        rag_q = st.text_input("Ask about the indexed text:", value="What does the notes file say?")
        if st.button("Search + Answer (LlamaIndex)"):
            with st.spinner("Searching + generating..."):
                ans = qe.query(rag_q)
            st.write(ans.response)
            with st.expander("Show retrieved nodes"):
                for n in ans.source_nodes:
                    # A retrieved node may carry no score; avoid formatting None.
                    if n.score is None:
                        st.markdown("**Score:** n/a")
                    else:
                        st.markdown(f"**Score:** {n.score:.3f}")
                    # Show only the first 500 characters of each node.
                    st.code(n.node.get_content()[:500])
|
|