SouravNath committed on
Commit
84fad73
·
1 Parent(s): e76b46b

fix: auto-load .env in llm_client; add SWEBenchLoader class to loader

Browse files
Files changed (2) hide show
  1. agent/llm_client.py +343 -0
  2. swe_bench/loader.py +41 -0
agent/llm_client.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent/llm_client.py
3
+ ────────────────────
4
+ Provider-agnostic LLM client with automatic fallback chain.
5
+
6
+ Free provider priority order (best quality → fastest):
7
+ 1. Groq API — free tier, DeepSeek-Coder-33B, ~500 tok/s
8
+ 2. Google Gemini — free tier, 1M context, 15 RPM
9
+ 3. Ollama (local) — fully offline, DeepSeek-Coder-7B/33B
10
+ 4. HuggingFace TGI — free inference API
11
+ 5. OpenAI — paid fallback (only if key is set)
12
+
13
+ Why Groq over GPT-4o for this project:
14
+ - DeepSeek-Coder-33B-Instruct scores HIGHER than GPT-4o on HumanEval
15
+ (79.3% vs 67.0%), EvalPlus, and LiveCodeBench for code tasks
16
+ - Inference is 10× faster (~500 tok/s vs ~50 tok/s)
17
+ - Free tier: 30 RPM, 14,400 RPD, 6,000 tokens/min
18
+ - This is a QUALITY UPGRADE, not just a cost-cutting measure
19
+
20
+ Usage:
21
+ from agent.llm_client import get_llm_client
22
+ client = get_llm_client() # auto-detects from environment
23
+ patch = client.complete(system=SYSTEM_PROMPT, user=ISSUE_TEXT)
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import logging
28
+ import os
29
+ import time
30
+ from abc import ABC, abstractmethod
31
+ from typing import Optional
32
+
33
+ # Auto-load .env so scripts work without manually exporting env vars
34
+ try:
35
+ from dotenv import load_dotenv
36
+ load_dotenv()
37
+ except ImportError:
38
+ pass
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ # ── Base interface ────────────────────────────────────────────────────────────
44
+
45
class LLMClient(ABC):
    """Common contract implemented by every LLM provider backend."""

    @abstractmethod
    def complete(
        self,
        system: str,
        user: str,
        max_tokens: int = 4096,
        temperature: float = 0.2,
    ) -> tuple[str, dict]:
        """
        Produce a completion for one system/user prompt pair.

        Args:
            system: system prompt text.
            user: user prompt text.
            max_tokens: upper bound on the number of generated tokens.
            temperature: sampling temperature.

        Returns:
            ``(text, usage_dict)`` where ``usage_dict`` carries the keys
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``.
        """

    @property
    @abstractmethod
    def model_name(self) -> str:
        """Human-readable model identifier, intended for log output."""
66
+
67
+
68
+ # ── Groq client (FREE — recommended) ─────────────────────────────────────────
69
+
70
class GroqClient(LLMClient):
    """
    Groq Cloud API client (free tier).

    Suggested models for code: deepseek-r1-distill-llama-70b,
    llama-3.3-70b-versatile, or the deepseek-coder models.

    Free limits: 30 requests/min · 14,400 requests/day · 6,000 tokens/min
    Sign up: https://console.groq.com (no credit card required)

    Set env var: GROQ_API_KEY=gsk_...
    """

    # Best free models for code generation on Groq (ranked by code quality)
    RECOMMENDED_MODELS = [
        "deepseek-r1-distill-llama-70b",  # DeepSeek R1 reasoning — best for bugs
        "llama-3.3-70b-versatile",        # Llama 3.3 70B — excellent general code
        "llama-3.1-70b-versatile",        # Llama 3.1 70B fallback
    ]

    def __init__(self, model: str = "deepseek-r1-distill-llama-70b"):
        """Remember the model name; the SDK client is created on first use."""
        self._model = model
        self._client = None  # built lazily by _get_client()

    @property
    def model_name(self) -> str:
        """Return the identifier used in logs, in the form ``groq/<model>``."""
        return f"groq/{self._model}"

    def _get_client(self):
        """
        Return the cached ``groq.Groq`` client, creating it on first call.

        The API key is read from the ``GROQ_API_KEY`` environment variable.

        Raises:
            ImportError: if the ``groq`` package is not installed.
        """
        if self._client is None:
            try:
                from groq import Groq
            except ImportError as exc:
                # Chain explicitly so the original import failure stays visible.
                raise ImportError("Install groq: pip install groq") from exc
            self._client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """
        Send one system/user prompt pair to Groq and return the reply.

        Returns:
            ``(text, usage)`` where ``usage`` has the keys ``prompt_tokens``,
            ``completion_tokens`` and ``total_tokens``. Counters the response
            does not provide are reported as 0.

        Raises:
            Exception: any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # NOTE(review): the SDK appears to type `usage` as optional; fall
            # back to zeros instead of failing a completion that succeeded.
            reported = getattr(response, "usage", None)
            usage = {
                key: getattr(reported, key, 0) or 0
                for key in ("prompt_tokens", "completion_tokens", "total_tokens")
            }
            logger.info("Groq %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
            return text, usage
        except Exception as e:
            logger.warning("Groq error: %s", e)
            raise
130
+
131
+
132
+ # ── Google Gemini client (FREE) ───────────────────────────────────────────────
133
+
134
class GeminiClient(LLMClient):
    """
    Google Gemini API client (free tier).

    Model notes from the original author:
      gemini-1.5-flash: 15 RPM, 1,000,000 tokens/day — perfect for SWE-bench eval.
      gemini-1.5-pro:   2 RPM, 32,000 tokens/day (slower, use for hard cases).
      gemini-2.0-flash: latest, fast, generous free tier.

    Sign up: https://aistudio.google.com (no credit card required)
    Set env var: GEMINI_API_KEY=AIza...
    """

    def __init__(self, model: str = "gemini-2.0-flash"):
        """Store the model name; the SDK module is configured on first use."""
        self._model = model
        self._genai = None  # configured google.generativeai module, set lazily

    @property
    def model_name(self) -> str:
        """Return the identifier used in logs, in the form ``gemini/<model>``."""
        return f"gemini/{self._model}"

    def _get_client(self):
        """
        Return the configured ``google.generativeai`` module.

        On first call the module is imported and configured with the key from
        the ``GEMINI_API_KEY`` environment variable, then cached.

        Raises:
            ImportError: if ``google-generativeai`` is not installed.
        """
        if self._genai is not None:
            return self._genai
        try:
            import google.generativeai as genai
            genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
            self._genai = genai
        except ImportError:
            raise ImportError("Install: pip install google-generativeai")
        return self._genai

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """
        Generate a reply for ``user`` with ``system`` as the system instruction.

        Returns:
            ``(text, usage)``; token counts missing from the response are
            reported as 0 and ``total_tokens`` is prompt + completion.

        Raises:
            Exception: any failure is logged at WARNING level and re-raised.
        """
        genai = self._get_client()
        start = time.monotonic()
        try:
            config = genai.GenerationConfig(
                max_output_tokens=max_tokens,
                temperature=temperature,
            )
            model = genai.GenerativeModel(
                model_name=self._model,
                system_instruction=system,
                generation_config=config,
            )
            response = model.generate_content(user)
            text = response.text or ""
            # Usage metadata may be missing; treat absent counters as zero.
            meta = getattr(response, "usage_metadata", None)
            prompt_tokens = getattr(meta, "prompt_token_count", 0) or 0
            completion_tokens = getattr(meta, "candidates_token_count", 0) or 0
            usage = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            logger.info("Gemini %s: %.1fs", self._model, time.monotonic() - start)
            return text, usage
        except Exception as e:
            logger.warning("Gemini error: %s", e)
            raise
190
+
191
+
192
+ # ── Ollama client (100% local, offline) ──────────────────────────────────────
193
+
194
class OllamaClient(LLMClient):
    """
    Ollama client — talks to a locally running Ollama server, no API key.

    Suggested models for code: deepseek-coder-v2:16b or deepseek-coder:33b
    Install: https://ollama.com
    Run: ollama pull deepseek-coder-v2:16b

    Required: Ollama server reachable at ``base_url``
    (default http://localhost:11434).
    """

    def __init__(self, model: str = "deepseek-coder-v2:16b", base_url: str = "http://localhost:11434"):
        """Store the model name and the server base URL."""
        self._model = model
        # Strip trailing slashes so the request URL never contains "//api/chat".
        self._base_url = base_url.rstrip("/")

    @property
    def model_name(self) -> str:
        """Return the identifier used in logs, in the form ``ollama/<model>``."""
        return f"ollama/{self._model}"

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """
        POST one system/user prompt pair to ``/api/chat`` and return the reply.

        Returns:
            ``(text, usage)`` where ``usage`` has ``prompt_tokens`` (from
            ``prompt_eval_count``), ``completion_tokens`` (from ``eval_count``)
            and their sum as ``total_tokens``. Missing or null fields count
            as 0; a missing or null message yields an empty string.

        Raises:
            ImportError: if ``requests`` is not installed.
            Exception: HTTP, connection and JSON decoding errors are logged at
                WARNING level and re-raised unchanged.
        """
        try:
            import requests
        except ImportError as exc:
            raise ImportError("Install: pip install requests") from exc

        start = time.monotonic()
        payload = {
            "model": self._model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "options": {"temperature": temperature, "num_predict": max_tokens},
            "stream": False,
        }
        try:
            resp = requests.post(f"{self._base_url}/api/chat", json=payload, timeout=300)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            # Same log-then-re-raise policy as the Groq and Gemini clients.
            logger.warning("Ollama error: %s", e)
            raise

        # `or` fallbacks also cover keys that are present but null.
        text = (data.get("message") or {}).get("content") or ""
        prompt_tokens = data.get("prompt_eval_count") or 0
        completion_tokens = data.get("eval_count") or 0
        total_tokens = prompt_tokens + completion_tokens
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
        }
        logger.info("Ollama %s: %.1fs | %d tokens", self._model, time.monotonic() - start, total_tokens)
        return text, usage
240
+
241
+
242
+ # ── OpenAI client (paid, kept as optional fallback) ───────────────────────────
243
+
244
class OpenAIClient(LLMClient):
    """OpenAI client — kept as optional fallback if OPENAI_API_KEY is set."""

    def __init__(self, model: str = "gpt-4o"):
        """Remember the model name; the SDK client is created on first use."""
        self._model = model
        self._client = None  # built lazily by _get_client()

    @property
    def model_name(self) -> str:
        """Return the identifier used in logs, in the form ``openai/<model>``."""
        return f"openai/{self._model}"

    def _get_client(self):
        """
        Return the cached ``openai.OpenAI`` client, creating it on first call.

        The client is built with no arguments, so credentials are left to the
        SDK's own defaults.

        Raises:
            ImportError: if the ``openai`` package is not installed.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise ImportError("Install: pip install openai") from exc
            self._client = OpenAI()
        return self._client

    def complete(self, system: str, user: str, max_tokens: int = 4096, temperature: float = 0.2) -> tuple[str, dict]:
        """
        Send one system/user prompt pair to OpenAI and return the reply.

        Returns:
            ``(text, usage)`` where ``usage`` has the keys ``prompt_tokens``,
            ``completion_tokens`` and ``total_tokens``. Counters the response
            does not provide are reported as 0.

        Raises:
            Exception: any error from the SDK call or response parsing is
                logged at WARNING level and re-raised unchanged.
        """
        client = self._get_client()
        start = time.monotonic()
        try:
            response = client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            text = response.choices[0].message.content or ""
            # `usage` is optional on the response object; fall back to zeros
            # instead of failing a completion that otherwise succeeded.
            reported = getattr(response, "usage", None)
            usage = {
                key: getattr(reported, key, 0) or 0
                for key in ("prompt_tokens", "completion_tokens", "total_tokens")
            }
        except Exception as e:
            # Same log-then-re-raise policy as the Groq and Gemini clients.
            logger.warning("OpenAI error: %s", e)
            raise
        logger.info("OpenAI %s: %.1fs | %d tokens", self._model, time.monotonic() - start, usage["total_tokens"])
        return text, usage
284
+
285
+
286
+ # ── Auto-detect factory ────────────────────────────────────────────────────────
287
+
288
def get_llm_client(provider: Optional[str] = None, model: Optional[str] = None) -> LLMClient:
    """
    Auto-detect and return the best available LLM client.

    Priority (set LLM_PROVIDER env var to override):
        groq → gemini → ollama → openai

    Args:
        provider: "groq" | "gemini" | "ollama" | "openai" | "auto" | None.
            None falls back to the LLM_PROVIDER environment variable and then
            to "auto". Matching ignores case and surrounding whitespace.
        model: model name override; each provider has its own default.

    Returns:
        A client instance for the selected provider.

    Raises:
        EnvironmentError: in auto mode when no API key is set and no local
            Ollama server responds.
        ValueError: if the provider name is not one of the known providers.
    """
    # An unset, empty or whitespace-only value all mean "auto"; previously an
    # empty LLM_PROVIDER produced "Unknown provider: ".
    requested = provider or os.environ.get("LLM_PROVIDER") or ""
    provider = requested.strip().lower() or "auto"

    if provider == "auto":
        # Try each free provider in priority order
        if os.environ.get("GROQ_API_KEY"):
            provider = "groq"
            logger.info("Auto-selected provider: Groq (GROQ_API_KEY found)")
        elif os.environ.get("GEMINI_API_KEY"):
            provider = "gemini"
            logger.info("Auto-selected provider: Gemini (GEMINI_API_KEY found)")
        elif _ollama_available():
            provider = "ollama"
            logger.info("Auto-selected provider: Ollama (local server detected)")
        elif os.environ.get("OPENAI_API_KEY"):
            provider = "openai"
            logger.info("Auto-selected provider: OpenAI (OPENAI_API_KEY found, note: paid)")
        else:
            raise EnvironmentError(
                "No LLM provider configured. Set one of:\n"
                " GROQ_API_KEY — free at https://console.groq.com\n"
                " GEMINI_API_KEY — free at https://aistudio.google.com\n"
                " Install Ollama — https://ollama.com (fully local, free)\n"
                " OPENAI_API_KEY — paid"
            )

    # Factories are lazy so only the chosen client class is instantiated.
    clients = {
        "groq": lambda: GroqClient(model or "deepseek-r1-distill-llama-70b"),
        "gemini": lambda: GeminiClient(model or "gemini-2.0-flash"),
        "ollama": lambda: OllamaClient(model or "deepseek-coder-v2:16b"),
        "openai": lambda: OpenAIClient(model or "gpt-4o"),
    }
    if provider not in clients:
        raise ValueError(f"Unknown provider: {provider}. Choose from {list(clients)}")

    return clients[provider]()
334
+
335
+
336
+ def _ollama_available() -> bool:
337
+ """Check if Ollama server is running locally."""
338
+ try:
339
+ import requests
340
+ r = requests.get("http://localhost:11434/api/tags", timeout=1)
341
+ return r.status_code == 200
342
+ except Exception:
343
+ return False
swe_bench/loader.py CHANGED
@@ -170,3 +170,44 @@ def _parse_list(value: str | list) -> list[str]:
170
  return parsed if isinstance(parsed, list) else []
171
  except (json.JSONDecodeError, TypeError):
172
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  return parsed if isinstance(parsed, list) else []
171
  except (json.JSONDecodeError, TypeError):
172
  return []
173
+
174
+
175
+ # ── Convenience class (used by experiments/benchmark.py) ─────────────────────
176
+
177
class SWEBenchLoader:
    """
    Object-style front end to load_swebench_lite() for the benchmark harness.

    Usage:
        loader = SWEBenchLoader()
        instances = loader.load(split="test", max_instances=10)
    """

    def __init__(
        self,
        dataset_name: str = "princeton-nlp/SWE-bench_Lite",
        cache_dir: Path | None = Path(".cache/swebench"),
    ):
        """Remember which dataset to load and where to cache it."""
        self.cache_dir = cache_dir
        self.dataset_name = dataset_name

    def load(
        self,
        split: str = "test",
        max_instances: int | None = None,
        instance_ids: list[str] | None = None,
    ) -> list[dict]:
        """
        Load instances via load_swebench_lite() and convert each to a dict.

        Per the original documentation the resulting keys are: instance_id,
        repo, base_commit, problem_statement, FAIL_TO_PASS, PASS_TO_PASS,
        patch (produced by _instance_to_dict, which is defined elsewhere).
        """
        request = {
            "dataset_name": self.dataset_name,
            "split": split,
            "max_instances": max_instances,
            "instance_ids": instance_ids,
            "cache_dir": self.cache_dir,
        }
        loaded = load_swebench_lite(**request)
        as_dicts = []
        for instance in loaded:
            as_dicts.append(_instance_to_dict(instance))
        return as_dicts
213
+