Spaces:
Running
Running
| """ | |
| agent/llm_client.py | |
| ──────────────────── | |
| Provider-agnostic LLM client with automatic fallback chain. | |
| Free provider priority order (best quality → fastest): | |
| 1. Groq API — free tier, DeepSeek-Coder-33B, ~500 tok/s | |
| 2. Google Gemini — free tier, 1M context, 15 RPM | |
| 3. Ollama (local) — fully offline, DeepSeek-Coder-7B/33B | |
| 4. HuggingFace TGI — free inference API | |
| 5. OpenAI — paid fallback (only if key is set) | |
| Why Groq over GPT-4o for this project: | |
| - DeepSeek-Coder-33B-Instruct scores HIGHER than GPT-4o on HumanEval | |
| (79.3% vs 67.0%), EvalPlus, and LiveCodeBench for code tasks | |
| - Inference is 10× faster (~500 tok/s vs ~50 tok/s) | |
| - Free tier: 30 RPM, 14,400 RPD, 6,000 tokens/min | |
| - This is a QUALITY UPGRADE, not just a cost-cutting measure | |
| Usage: | |
| from agent.llm_client import get_llm_client | |
| client = get_llm_client() # auto-detects from environment | |
| patch = client.complete(system=SYSTEM_PROMPT, user=ISSUE_TEXT) | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| import time | |
| from abc import ABC, abstractmethod | |
| from typing import Optional | |
| # Auto-load .env so scripts work without manually exporting env vars | |
| try: | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| except ImportError: | |
| pass | |
| logger = logging.getLogger(__name__) | |
| # ── Base interface ──────────────────────────────────────────────────────────── | |
class LLMClient(ABC):
    """Provider-agnostic LLM interface.

    Concrete providers subclass this and implement both methods. The methods
    are marked abstract so that instantiating ``LLMClient`` directly, or a
    subclass that forgets one of them, raises ``TypeError`` instead of
    silently returning ``None`` from an empty method body.
    """

    @abstractmethod
    def complete(
        self,
        system: str,
        user: str,
        max_tokens: int = 4096,
        temperature: float = 0.2,
    ) -> tuple[str, dict]:
        """Generate a completion.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature.

        Returns:
            ``(text, usage_dict)`` where ``usage_dict`` has the keys
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
        """

    @abstractmethod
    def model_name(self) -> str:
        """Return a human-readable model identifier for logging."""
| # ── Groq client (FREE — recommended) ────────────────────────────────────────── | |
class GroqClient(LLMClient):
    """Groq Cloud API client (free tier).

    Best models for code: deepseek-r1-distill-llama-70b,
    llama-3.3-70b-versatile, or deepseek-coder models.

    Free limits: 30 requests/min · 14,400 requests/day · 6,000 tokens/min
    Sign up: https://console.groq.com (no credit card required)
    Set env var: GROQ_API_KEY=gsk_...
    """

    # Best free models for code generation on Groq (ranked by code quality)
    RECOMMENDED_MODELS = [
        "deepseek-r1-distill-llama-70b",  # DeepSeek R1 reasoning — best for bugs
        "llama-3.3-70b-versatile",        # Llama 3.3 70B — excellent general code
        "llama-3.1-70b-versatile",        # Llama 3.1 70B fallback
    ]

    def __init__(self, model: str = "deepseek-r1-distill-llama-70b"):
        """Store the model name; the SDK client is created on first use."""
        self._model = model
        self._client = None

    def model_name(self) -> str:
        """Return the identifier used in logs, e.g. ``groq/<model>``."""
        return f"groq/{self._model}"

    def _get_client(self):
        """Return the cached ``groq.Groq`` client, creating it on first call.

        Raises:
            ImportError: If the ``groq`` package is not installed.
        """
        if self._client is None:
            try:
                from groq import Groq
            except ImportError as err:
                # Chain the original error so the real import failure stays visible.
                raise ImportError("Install groq: pip install groq") from err
            self._client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Send one system+user chat request and return ``(text, usage)``.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            The reply text ("" when the message content is empty) and a dict
            with ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
            Counts fall back to 0 when the response carries no usage object.

        Raises:
            Exception: Any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # The usage object may be absent; do not let token accounting
            # turn a successful completion into an AttributeError.
            usage_info = getattr(response, "usage", None)
            usage = {
                "prompt_tokens": getattr(usage_info, "prompt_tokens", 0) or 0,
                "completion_tokens": getattr(usage_info, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage_info, "total_tokens", 0) or 0,
            }
            logger.info("Groq %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
            return text, usage
        except Exception as e:
            logger.warning("Groq error: %s", e)
            raise
| # ── Google Gemini client (FREE) ─────────────────────────────────────────────── | |
class GeminiClient(LLMClient):
    """Google Gemini API client (free tier).

    Model notes:
        gemini-1.5-flash: 15 RPM, 1,000,000 tokens/day — suited to SWE-bench eval.
        gemini-1.5-pro:   2 RPM, 32,000 tokens/day (slower, use for hard cases).
        gemini-2.0-flash: latest, fast, generous free tier.

    Sign up: https://aistudio.google.com (no credit card required)
    Set env var: GEMINI_API_KEY=AIza...
    """

    def __init__(self, model: str = "gemini-2.0-flash"):
        """Remember the model name; the SDK module is loaded on first use."""
        self._genai = None
        self._model = model

    def model_name(self) -> str:
        """Return the identifier used in logs, e.g. ``gemini/<model>``."""
        return f"gemini/{self._model}"

    def _get_client(self):
        """Return the configured ``google.generativeai`` module, loading it once."""
        if self._genai is not None:
            return self._genai
        try:
            import google.generativeai as genai
            genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
            self._genai = genai
        except ImportError:
            raise ImportError("Install: pip install google-generativeai")
        return self._genai

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Generate a reply for ``user`` under the ``system`` instruction.

        Returns ``(text, usage)`` where ``usage`` holds ``prompt_tokens``,
        ``completion_tokens`` and their sum as ``total_tokens``. Any exception
        raised while building the model, generating, or reading the response
        is logged at WARNING level and re-raised.
        """
        sdk = self._get_client()
        started_at = time.monotonic()
        try:
            generation_config = sdk.GenerationConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            )
            gemini_model = sdk.GenerativeModel(
                model_name=self._model,
                system_instruction=system,
                generation_config=generation_config,
            )
            reply = gemini_model.generate_content(user)
            text = reply.text or ""

            # Usage metadata is not always present, so every lookup
            # degrades to 0 rather than failing.
            meta = getattr(reply, "usage_metadata", None)
            n_prompt = getattr(meta, "prompt_token_count", 0) or 0
            n_completion = getattr(meta, "candidates_token_count", 0) or 0
            usage = {
                "prompt_tokens": n_prompt,
                "completion_tokens": n_completion,
                "total_tokens": n_prompt + n_completion,
            }

            logger.info("Gemini %s: %.1fs", self._model, time.monotonic() - started_at)
            return text, usage
        except Exception as err:
            logger.warning("Gemini error: %s", err)
            raise
| # ── Ollama client (100% local, offline) ─────────────────────────────────────── | |
class OllamaClient(LLMClient):
    """Ollama client: talks to a local Ollama server over HTTP, no API key.

    Best model for code: deepseek-coder-v2:16b or deepseek-coder:33b
    Install: https://ollama.com
    Run: ollama pull deepseek-coder-v2:16b
    Required: Ollama server reachable at ``base_url`` (default localhost:11434)
    """

    def __init__(
        self,
        model: str = "deepseek-coder-v2:16b",
        base_url: str = "http://localhost:11434",
        timeout: float = 300,
    ):
        """Configure the client.

        Args:
            model: Ollama model tag to request.
            base_url: Root URL of the Ollama server. A trailing slash is
                stripped so the request path never becomes ``//api/chat``.
            timeout: Seconds to wait for the HTTP response.
        """
        self._model = model
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout

    def model_name(self) -> str:
        """Return the identifier used in logs, e.g. ``ollama/<model>``."""
        return f"ollama/{self._model}"

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """POST one non-streaming chat request to ``/api/chat``.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Sent as the ``num_predict`` option.
            temperature: Sent as the ``temperature`` option.

        Returns:
            ``(text, usage)``. ``text`` is the reply's ``message.content``
            ("" when missing). ``usage`` maps ``prompt_eval_count`` and
            ``eval_count`` from the response to ``prompt_tokens`` and
            ``completion_tokens``, with ``total_tokens`` as their sum;
            missing or null counts are treated as 0.

        Raises:
            ImportError: If ``requests`` is not installed.
            Exception: HTTP, connection or JSON-decoding errors are logged
                at WARNING level and re-raised unchanged.
        """
        try:
            import requests
        except ImportError as err:
            raise ImportError("Install: pip install requests") from err

        start = time.monotonic()
        payload = {
            "model": self._model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "options": {"temperature": temperature, "num_predict": max_tokens},
            "stream": False,
        }
        try:
            resp = requests.post(f"{self._base_url}/api/chat", json=payload, timeout=self._timeout)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            # Same log-and-re-raise behaviour as the other provider clients.
            logger.warning("Ollama error: %s", e)
            raise

        # `or` fallbacks also cover keys that are present but null.
        text = (data.get("message") or {}).get("content") or ""
        prompt_tokens = data.get("prompt_eval_count") or 0
        completion_tokens = data.get("eval_count") or 0
        total_tokens = prompt_tokens + completion_tokens
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        }
        logger.info("Ollama %s: %.1fs | %d tokens", self._model, time.monotonic() - start, total_tokens)
        return text, usage
| # ── OpenAI client (paid, kept as optional fallback) ─────────────────────────── | |
class OpenAIClient(LLMClient):
    """OpenAI client, kept as an optional fallback if OPENAI_API_KEY is set."""

    def __init__(self, model: str = "gpt-4o"):
        """Store the model name; the SDK client is created on first use."""
        self._model = model
        self._client = None

    def model_name(self) -> str:
        """Return the identifier used in logs, e.g. ``openai/<model>``."""
        return f"openai/{self._model}"

    def _get_client(self):
        """Return the cached ``openai.OpenAI`` client, creating it on first call.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as err:
                # Chain the original error so the real import failure stays visible.
                raise ImportError("Install: pip install openai") from err
            self._client = OpenAI()
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Send one system+user chat request and return ``(text, usage)``.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            The reply text ("" when the message content is empty) and a dict
            with ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
            Counts fall back to 0 when the response carries no usage object.

        Raises:
            Exception: Any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # The usage object may be absent; keep token accounting from
            # failing an otherwise successful completion.
            usage_info = getattr(response, "usage", None)
            usage = {
                "prompt_tokens": getattr(usage_info, "prompt_tokens", 0) or 0,
                "completion_tokens": getattr(usage_info, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage_info, "total_tokens", 0) or 0,
            }
            logger.info("OpenAI %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
            return text, usage
        except Exception as e:
            # Same log-and-re-raise behaviour as the Groq and Gemini clients.
            logger.warning("OpenAI error: %s", e)
            raise
| # ── Auto-detect factory ─────────────────────────────────────────────────────── | |
def get_llm_client(provider: Optional[str] = None, model: Optional[str] = None) -> LLMClient:
    """Return an LLM client for the requested or auto-detected provider.

    Auto-detection priority (set the LLM_PROVIDER env var to override):
        groq → gemini → ollama → openai

    Args:
        provider: "groq" | "gemini" | "ollama" | "openai" | "auto" | None.
            ``None`` falls back to ``LLM_PROVIDER`` and then to "auto".
            The name is matched case-insensitively, ignoring surrounding
            whitespace.
        model: Model name override. When falsy, the chosen client's own
            default model is used.

    Returns:
        A concrete ``LLMClient`` for the selected provider.

    Raises:
        EnvironmentError: In auto mode, when no API key is set and no local
            Ollama server responds.
        ValueError: If the provider name is not one of the known providers.
    """
    # `or` chaining (not a .get default) so an empty LLM_PROVIDER="" still
    # means "auto" instead of failing as an unknown provider.
    requested = provider or os.environ.get("LLM_PROVIDER") or "auto"
    provider = requested.strip().lower() or "auto"

    if provider == "auto":
        # Try each free provider in priority order
        if os.environ.get("GROQ_API_KEY"):
            provider = "groq"
            logger.info("Auto-selected provider: Groq (GROQ_API_KEY found)")
        elif os.environ.get("GEMINI_API_KEY"):
            provider = "gemini"
            logger.info("Auto-selected provider: Gemini (GEMINI_API_KEY found)")
        elif _ollama_available():
            provider = "ollama"
            logger.info("Auto-selected provider: Ollama (local server detected)")
        elif os.environ.get("OPENAI_API_KEY"):
            provider = "openai"
            logger.info("Auto-selected provider: OpenAI (OPENAI_API_KEY found, note: paid)")
        else:
            raise EnvironmentError(
                "No LLM provider configured. Set one of:\n"
                " GROQ_API_KEY → free at https://console.groq.com\n"
                " GEMINI_API_KEY → free at https://aistudio.google.com\n"
                " Install Ollama → https://ollama.com (fully local, free)\n"
                " OPENAI_API_KEY → paid"
            )

    clients = {
        "groq": GroqClient,
        "gemini": GeminiClient,
        "ollama": OllamaClient,
        "openai": OpenAIClient,
    }
    if provider not in clients:
        raise ValueError(f"Unknown provider: {provider}. Choose from {list(clients)}")

    # Each constructor already carries its default model, so only pass
    # `model` when the caller actually supplied one.
    client_cls = clients[provider]
    return client_cls(model) if model else client_cls()
| def _ollama_available() -> bool: | |
| """Check if Ollama server is running locally.""" | |
| try: | |
| import requests | |
| r = requests.get("http://localhost:11434/api/tags", timeout=1) | |
| return r.status_code == 200 | |
| except Exception: | |
| return False | |