# CORE LLM CLIENT — core/llm_client.py
"""Thin wrapper around the Groq chat-completions API.

Importing this module has side effects: it loads variables from a ``.env``
file, requires ``GROQ_API_KEY`` to be present in the environment (raising
``ValueError`` otherwise), and builds one module-level ``Groq`` client that
``ask_llm`` reuses for every request.
"""

import os

from dotenv import load_dotenv
from groq import Groq

# NOTE: earlier experiments that were parked in this file as commented-out
# code or bare triple-quoted strings (a Hugging Face ``InferenceClient``
# variant keyed by ``HF_TOKEN``, and two local Ollama variants that truncated
# prompts to 3000 characters) were removed as dead code.

# 🔥 Load the .env file so the key below can come from it.
load_dotenv()

# 🔐 Load the API key from the environment and fail fast when it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("❌ GROQ_API_KEY not set in environment variables")

# Shared client used by every ask_llm() call.
client = Groq(api_key=GROQ_API_KEY)

# Defaults used when the caller does not override them.
DEFAULT_MODEL = "llama-3.1-8b-instant"
DEFAULT_TEMPERATURE = 0.2  # lowered from 0.7


def ask_llm(
    prompt: str,
    *,
    model: str = DEFAULT_MODEL,
    temperature: float = DEFAULT_TEMPERATURE,
) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Progress and failures are reported with ``print``. No ``max_tokens``
    limit is passed to the API.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Model identifier passed to the chat-completions endpoint.
        temperature: Sampling temperature passed to the endpoint.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the reply carries no content. If the request
        or response handling raises, the exception is not propagated; the
        string ``"LLM Error: <message>"`` is returned instead.
    """
    print("\n🚀 LLM CALL INITIATED")
    print("🧾 PROMPT LENGTH:", len(prompt))

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        # ``content`` may be None (presumably e.g. for replies without text —
        # confirm against the SDK); normalise it so len()/strip() cannot fail
        # and surface as a misleading "LLM Error".
        response = completion.choices[0].message.content or ""
        print("✅ LLM RESPONSE RECEIVED")
        print("📏 RESPONSE LENGTH:", len(response))
        return response.strip()
    except Exception as e:
        # Deliberate catch-all: callers receive an error string rather than
        # an exception.
        print("❌ LLM ERROR:", str(e))
        return f"LLM Error: {e}"