repomind-api / agent /llm_client.py
SouravNath's picture
fix: auto-load .env in llm_client; add SWEBenchLoader class to loader
84fad73
"""
agent/llm_client.py
────────────────────
Provider-agnostic LLM client with automatic fallback chain.
Free provider priority order (best quality → fastest):
1. Groq API — free tier, DeepSeek-Coder-33B, ~500 tok/s
2. Google Gemini — free tier, 1M context, 15 RPM
3. Ollama (local) — fully offline, DeepSeek-Coder-7B/33B
4. HuggingFace TGI — free inference API (no client class in this module yet)
5. OpenAI — paid fallback (only if key is set)
Why Groq over GPT-4o for this project:
- DeepSeek-Coder-33B-Instruct scores HIGHER than GPT-4o on HumanEval
(79.3% vs 67.0%), EvalPlus, and LiveCodeBench for code tasks
- Inference is 10× faster (~500 tok/s vs ~50 tok/s)
- Free tier: 30 RPM, 14,400 RPD, 6,000 tokens/min
- This is a QUALITY UPGRADE, not just a cost-cutting measure
Usage:
from agent.llm_client import get_llm_client
client = get_llm_client() # auto-detects from environment
patch, usage = client.complete(system=SYSTEM_PROMPT, user=ISSUE_TEXT)  # returns (text, usage_dict)
"""
from __future__ import annotations
import logging
import os
import time
from abc import ABC, abstractmethod
from typing import Optional
# Auto-load .env so scripts work without manually exporting env vars
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass
logger = logging.getLogger(__name__)
# ── Base interface ────────────────────────────────────────────────────────────
class LLMClient(ABC):
    """Abstract contract shared by every provider client in this module.

    Concrete subclasses supply a ``model_name`` property and a ``complete``
    method; callers can then swap providers without changing call sites.
    """

    @property
    @abstractmethod
    def model_name(self) -> str:
        """Return a human-readable model identifier, intended for logging."""

    @abstractmethod
    def complete(
        self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2
    ) -> tuple[str, dict]:
        """Generate one completion for a system prompt plus a user message.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature.

        Returns:
            A ``(text, usage_dict)`` pair. ``usage_dict`` carries the keys
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
        """
# ── Groq client (FREE — recommended) ─────────────────────────────────────────
class GroqClient(LLMClient):
    """
    Groq Cloud API client — free tier.

    Best models for code: deepseek-r1-distill-llama-70b,
    llama-3.3-70b-versatile, or the deepseek-coder models.

    Free limits: 30 requests/min · 14,400 requests/day · 6,000 tokens/min
    Sign up: https://console.groq.com (no credit card required)
    Set env var: GROQ_API_KEY=gsk_...
    """

    # Best free models for code generation on Groq (ranked by code quality)
    RECOMMENDED_MODELS = [
        "deepseek-r1-distill-llama-70b",  # DeepSeek R1 reasoning — best for bugs
        "llama-3.3-70b-versatile",  # Llama 3.3 70B — excellent general code
        "llama-3.1-70b-versatile",  # Llama 3.1 70B fallback
    ]

    def __init__(self, model: str = "deepseek-r1-distill-llama-70b"):
        """Store the model name; the SDK client is created lazily on first use."""
        self._model = model
        self._client = None

    @property
    def model_name(self) -> str:
        """Return the model identifier prefixed with ``groq/``."""
        return f"groq/{self._model}"

    def _get_client(self):
        """Return the cached ``groq.Groq`` client, creating it on first call.

        Raises:
            ImportError: If the ``groq`` package is not installed.
        """
        if self._client is None:
            try:
                from groq import Groq
            except ImportError as exc:
                # Chain the original error so the real import failure stays visible.
                raise ImportError("Install groq: pip install groq") from exc
            self._client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Run one chat completion against Groq.

        Args:
            system: System prompt text.
            user: User message text.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            ``(text, usage)`` where ``usage`` has ``prompt_tokens``,
            ``completion_tokens`` and ``total_tokens``. Counters default to 0
            when the response carries no usage information.

        Raises:
            Exception: Any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # `usage` may be absent/None; do not lose a successful completion
            # to an AttributeError while reading token counters.
            usage_info = getattr(response, "usage", None)
            usage = {
                "prompt_tokens": getattr(usage_info, "prompt_tokens", 0) or 0,
                "completion_tokens": getattr(usage_info, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage_info, "total_tokens", 0) or 0,
            }
            logger.info("Groq %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
            return text, usage
        except Exception as e:
            logger.warning("Groq error: %s", e)
            raise
# ── Google Gemini client (FREE) ───────────────────────────────────────────────
class GeminiClient(LLMClient):
    """
    Google Gemini API client — free tier.

    Model notes carried over from the original author:
      gemini-1.5-flash: 15 RPM, 1,000,000 tokens/day — perfect for SWE-bench eval.
      gemini-1.5-pro:   2 RPM, 32,000 tokens/day (slower, use for hard cases).
      gemini-2.0-flash: latest, fast, generous free tier.

    Sign up: https://aistudio.google.com (no credit card required)
    Set env var: GEMINI_API_KEY=AIza...
    """

    def __init__(self, model: str = "gemini-2.0-flash"):
        """Remember the model name; the SDK module is imported lazily."""
        self._genai = None
        self._model = model

    @property
    def model_name(self) -> str:
        """Return the model identifier prefixed with ``gemini/``."""
        return f"gemini/{self._model}"

    def _get_client(self):
        """Import and configure ``google.generativeai`` once, then reuse it.

        Raises:
            ImportError: If ``google-generativeai`` is not installed.
        """
        if self._genai is not None:
            return self._genai
        try:
            import google.generativeai as sdk
            sdk.configure(api_key=os.environ.get("GEMINI_API_KEY"))
        except ImportError:
            raise ImportError("Install: pip install google-generativeai")
        self._genai = sdk
        return self._genai

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Generate one completion with Gemini.

        The system prompt is passed as the model's ``system_instruction`` and
        the user text as the content to generate from.

        Returns:
            ``(text, usage)``; token counters fall back to 0 when the response
            exposes no usage metadata, and ``total_tokens`` is their sum.

        Raises:
            Exception: Any failure is logged at WARNING level and re-raised.
        """
        sdk = self._get_client()
        started = time.monotonic()
        try:
            config = sdk.GenerationConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            )
            gen_model = sdk.GenerativeModel(
                model_name=self._model,
                system_instruction=system,
                generation_config=config,
            )
            reply = gen_model.generate_content(user)
            text = reply.text or ""
            # Gemini doesn't always return usage metadata in free tier
            meta = getattr(reply, "usage_metadata", None)
            n_prompt = getattr(meta, "prompt_token_count", 0) or 0
            n_completion = getattr(meta, "candidates_token_count", 0) or 0
            usage = {
                "prompt_tokens": n_prompt,
                "completion_tokens": n_completion,
                "total_tokens": n_prompt + n_completion,
            }
            logger.info("Gemini %s: %.1fs", self._model, time.monotonic() - started)
            return text, usage
        except Exception as err:
            logger.warning("Gemini error: %s", err)
            raise
# ── Ollama client (100% local, offline) ──────────────────────────────────────
class OllamaClient(LLMClient):
    """
    Ollama client — run models 100% locally, no API key, no cost, no rate limits.

    Best model for code: deepseek-coder-v2:16b or deepseek-coder:33b
    Install: https://ollama.com
    Run: ollama pull deepseek-coder-v2:16b
    Required: Ollama server reachable at ``base_url`` (default localhost:11434)
    """

    def __init__(
        self,
        model: str = "deepseek-coder-v2:16b",
        base_url: str = "http://localhost:11434",
        timeout: float = 300,
    ):
        """Store connection settings.

        Args:
            model: Ollama model tag to request.
            base_url: Root URL of the Ollama server.
            timeout: Seconds to wait for the HTTP response (previously a
                hard-coded 300).
        """
        self._model = model
        self._base_url = base_url
        self._timeout = timeout

    @property
    def model_name(self) -> str:
        """Return the model identifier prefixed with ``ollama/``."""
        return f"ollama/{self._model}"

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Send a non-streaming chat request to ``/api/chat``.

        Returns:
            ``(text, usage)`` where ``prompt_tokens`` comes from
            ``prompt_eval_count`` and ``completion_tokens`` from
            ``eval_count``; missing or null fields count as 0.

        Raises:
            ImportError: If ``requests`` is not installed.
            Exception: HTTP/transport/JSON errors are logged at WARNING level
                and re-raised unchanged.
        """
        try:
            import requests
        except ImportError as exc:
            raise ImportError("Install: pip install requests") from exc

        start = time.monotonic()
        payload = {
            "model": self._model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "options": {"temperature": temperature, "num_predict": max_tokens},
            "stream": False,
        }
        # Tolerate a trailing slash in base_url so the path is not "//api/chat".
        url = f"{self._base_url.rstrip('/')}/api/chat"
        try:
            resp = requests.post(url, json=payload, timeout=self._timeout)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            # Same log-and-re-raise handling as the Groq and Gemini clients.
            logger.warning("Ollama error: %s", e)
            raise

        # `or` fallbacks cover both absent keys and explicit nulls.
        message = data.get("message") or {}
        text = message.get("content") or ""
        prompt_tokens = data.get("prompt_eval_count") or 0
        completion_tokens = data.get("eval_count") or 0
        total_tokens = prompt_tokens + completion_tokens
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        }
        logger.info("Ollama %s: %.1fs | %d tokens", self._model, time.monotonic() - start, total_tokens)
        return text, usage
# ── OpenAI client (paid, kept as optional fallback) ───────────────────────────
class OpenAIClient(LLMClient):
    """OpenAI client — kept as optional fallback if OPENAI_API_KEY is set."""

    def __init__(self, model: str = "gpt-4o"):
        """Store the model name; the SDK client is created lazily on first use."""
        self._model = model
        self._client = None

    @property
    def model_name(self) -> str:
        """Return the model identifier prefixed with ``openai/``."""
        return f"openai/{self._model}"

    def _get_client(self):
        """Return the cached ``openai.OpenAI`` client, creating it on first call.

        The client is constructed without arguments, so credentials are left
        to the SDK's own resolution.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise ImportError("Install: pip install openai") from exc
            self._client = OpenAI()
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """Run one chat completion against OpenAI.

        Returns:
            ``(text, usage)`` where ``usage`` has ``prompt_tokens``,
            ``completion_tokens`` and ``total_tokens``. Counters default to 0
            when the response carries no usage information.

        Raises:
            Exception: Any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # `usage` is optional on the response object; read it defensively.
            usage_info = getattr(response, "usage", None)
            usage = {
                "prompt_tokens": getattr(usage_info, "prompt_tokens", 0) or 0,
                "completion_tokens": getattr(usage_info, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage_info, "total_tokens", 0) or 0,
            }
        except Exception as e:
            # Same log-and-re-raise handling as the Groq and Gemini clients.
            logger.warning("OpenAI error: %s", e)
            raise
        logger.info("OpenAI %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
        return text, usage
# ── Auto-detect factory ────────────────────────────────────────────────────────
def get_llm_client(provider: Optional[str] = None, model: Optional[str] = None) -> LLMClient:
    """
    Auto-detect and return the best available free LLM client.

    Priority (set LLM_PROVIDER env var to override):
        groq → gemini → ollama → openai

    The provider name is matched case-insensitively and surrounding
    whitespace is ignored. An unset or empty provider (argument or
    ``LLM_PROVIDER``) means auto-detection.

    Args:
        provider: "groq" | "gemini" | "ollama" | "openai" | "auto" | None (auto)
        model: model name override; when omitted each provider's default is used

    Returns:
        A concrete ``LLMClient`` for the selected provider.

    Raises:
        EnvironmentError: In auto mode when no API key is set and no local
            Ollama server answers.
        ValueError: If the provider name is not one of the supported ones.
    """
    # `or "auto"` also covers LLM_PROVIDER being set but empty, which would
    # otherwise be reported as the unknown provider "".
    requested = provider or os.environ.get("LLM_PROVIDER") or "auto"
    provider = requested.strip().lower() or "auto"

    if provider == "auto":
        # Try each free provider in priority order
        if os.environ.get("GROQ_API_KEY"):
            provider = "groq"
            logger.info("Auto-selected provider: Groq (GROQ_API_KEY found)")
        elif os.environ.get("GEMINI_API_KEY"):
            provider = "gemini"
            logger.info("Auto-selected provider: Gemini (GEMINI_API_KEY found)")
        elif _ollama_available():
            provider = "ollama"
            logger.info("Auto-selected provider: Ollama (local server detected)")
        elif os.environ.get("OPENAI_API_KEY"):
            provider = "openai"
            logger.info("Auto-selected provider: OpenAI (OPENAI_API_KEY found, note: paid)")
        else:
            raise EnvironmentError(
                "No LLM provider configured. Set one of:\n"
                "  GROQ_API_KEY — free at https://console.groq.com\n"
                "  GEMINI_API_KEY — free at https://aistudio.google.com\n"
                "  Install Ollama — https://ollama.com (fully local, free)\n"
                "  OPENAI_API_KEY — paid"
            )

    # Lazy factories: only the selected provider's client is constructed.
    clients = {
        "groq": lambda: GroqClient(model or "deepseek-r1-distill-llama-70b"),
        "gemini": lambda: GeminiClient(model or "gemini-2.0-flash"),
        "ollama": lambda: OllamaClient(model or "deepseek-coder-v2:16b"),
        "openai": lambda: OpenAIClient(model or "gpt-4o"),
    }
    if provider not in clients:
        raise ValueError(f"Unknown provider: {requested}. Choose from {list(clients)}")
    return clients[provider]()
def _ollama_available() -> bool:
"""Check if Ollama server is running locally."""
try:
import requests
r = requests.get("http://localhost:11434/api/tags", timeout=1)
return r.status_code == 200
except Exception:
return False