vn6295337 committed (co-authored by Claude Opus 4.5)
Commit 494ea7a · 1 Parent(s): 1ad22ef

Rotate between LLM providers instead of consecutive retries


Before: Groq β†’ Groq β†’ Groq β†’ Gemini β†’ Gemini β†’ Gemini
After: Groq β†’ Gemini β†’ OpenRouter β†’ Groq β†’ Gemini β†’ OpenRouter

- 3 rounds of rotation through all providers
- 10s delay between each attempt
- Gives each provider time to recover from rate limits

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
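For illustration, the attempt order this commit produces can be sketched with a plain nested loop. The provider names and their order here are stand-ins inferred from the commit message, not read from the module's actual config:

    # Sketch: rotation order after this commit.
    # Each round walks the full provider list once before any provider is retried.
    providers = ["groq", "gemini", "openrouter"]  # assumed configured order
    MAX_ROUNDS = 3

    attempts = [p for _ in range(MAX_ROUNDS) for p in providers]
    print(" -> ".join(attempts))
    # groq -> gemini -> openrouter -> groq -> gemini -> openrouter -> groq -> gemini -> openrouter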

Files changed (1)
  src/llm_client.py +55 -77
src/llm_client.py CHANGED
@@ -6,12 +6,11 @@ Adopts pattern from Enterprise-AI-Gateway for resilient LLM access.
  import os
  import time
  import requests
- from requests.exceptions import HTTPError
  from typing import Optional, Tuple

- # Retry configuration for rate limits
- MAX_RETRIES = 3
- INITIAL_BACKOFF = 10  # seconds (backoffs: 10s, 20s, 40s)
+ # Retry configuration - rotate through providers instead of consecutive retries
+ MAX_ROUNDS = 3  # Number of times to cycle through all providers
+ PROVIDER_DELAY = 10  # seconds between provider attempts


  class LLMClient:
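These two constants bound the worst case. Assuming the 3-provider list from the commit message, a fully failing query makes 9 attempts with 8 inter-attempt waits, so the sleeps alone add up to 80s; request time, capped by the 30s timeout used below, comes on top. A quick check:

    # Worst case if every attempt fails (assumes 3 configured providers).
    MAX_ROUNDS, PROVIDER_DELAY = 3, 10
    providers_count = 3
    attempts = MAX_ROUNDS * providers_count        # 9 attempts
    sleep_total = (attempts - 1) * PROVIDER_DELAY  # 8 * 10s = 80s of sleeping
    request_cap = attempts * 30                    # 9 * 30s timeout = 270s ceiling
    print(attempts, sleep_total, request_cap)      # 9 80 270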
@@ -60,7 +59,10 @@ class LLMClient:

      def query(self, prompt: str, temperature: float = 0, max_tokens: int = 2048) -> Tuple[Optional[str], Optional[str], Optional[str], list]:
          """
-         Query LLM with cascading fallback across providers.
+         Query LLM with rotating fallback across providers.
+
+         Instead of retrying same provider consecutively, rotates:
+         Groq → Gemini → OpenRouter → Groq → Gemini → OpenRouter → ...

          Returns:
              Tuple of (response_content, provider_used, error_message, providers_failed)
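The return contract is unchanged, so callers keep unpacking the same 4-tuple. A hypothetical call site (LLMClient's constructor and provider config sit outside this diff, so the no-argument construction here is an assumption):

    # Hypothetical usage; constructor arguments are not shown in this diff.
    client = LLMClient()
    content, provider_info, error, failed = client.query("Summarize this log.")
    if content is None:
        print(f"gave up: {error}; tried: {[f['name'] for f in failed]}")
    else:
        print(f"answered by {provider_info}")  # e.g. "groq:<model-name>"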
@@ -68,77 +70,53 @@
          """
          errors = []
          providers_failed = []
-         last_was_rate_limited = False
-
-         for provider in self.providers:
-             # Add delay before trying next provider if previous one was rate limited
-             if last_was_rate_limited:
-                 print(f"Waiting 10s before trying {provider['name']} (rate limit cooldown)...")
-                 time.sleep(10)
-                 last_was_rate_limited = False
-
-             print(f"Attempting LLM call with {provider['name']}...")
-             start_time = time.perf_counter()
-
-             try:
-                 content, error = self._call_provider(
-                     provider=provider,
-                     prompt=prompt,
-                     temperature=temperature,
-                     max_tokens=max_tokens
-                 )
-                 latency_ms = int((time.perf_counter() - start_time) * 1000)
-
-                 if content:
-                     print(f"Success with {provider['name']} ({latency_ms}ms)")
-                     # Return provider:model format for detailed logging
-                     provider_info = f"{provider['name']}:{provider['model']}"
-                     return content, provider_info, None, providers_failed
-                 else:
-                     errors.append(f"{provider['name']}: {error}")
-                     providers_failed.append({"name": provider['name'], "error": error})
-                     print(f"Provider {provider['name']} failed: {error}")
-                     # Always delay before next provider fallback
-                     last_was_rate_limited = True
-
-             except Exception as e:
-                 errors.append(f"{provider['name']}: {str(e)}")
-                 providers_failed.append({"name": provider['name'], "error": str(e)})
-                 print(f"Provider {provider['name']} exception: {e}")
-                 # Always delay before next provider fallback
-                 last_was_rate_limited = True
-
-         return None, None, f"All LLM providers failed: {'; '.join(errors)}", providers_failed
-
-     def _request_with_retry(self, url: str, headers: dict, payload: dict, provider_name: str) -> requests.Response:
-         """Make HTTP request with exponential backoff retry on 429 rate limit."""
-         last_error = None
-
-         for attempt in range(MAX_RETRIES):
-             try:
-                 response = requests.post(url, headers=headers, json=payload, timeout=30)
-                 response.raise_for_status()
-                 return response
-             except HTTPError as e:
-                 if e.response is not None and e.response.status_code == 429:
-                     last_error = e
-                     if attempt < MAX_RETRIES - 1:
-                         backoff = INITIAL_BACKOFF * (2 ** attempt)  # 10s, 20s, 40s
-                         print(f"Rate limited by {provider_name}, retrying in {backoff}s (attempt {attempt + 1}/{MAX_RETRIES})...")
-                         time.sleep(backoff)
-                         continue
-                 # Re-raise non-429 errors or final 429
-                 raise
-             except Exception:
-                 raise
-
-         # Should not reach here, but just in case
-         if last_error:
-             raise last_error
-         raise Exception(f"Request failed after {MAX_RETRIES} attempts")
+         is_first_attempt = True
+
+         # Rotate through providers for MAX_ROUNDS cycles
+         for round_num in range(MAX_ROUNDS):
+             for provider in self.providers:
+                 # Add delay between attempts (skip first attempt)
+                 if not is_first_attempt:
+                     print(f"Waiting {PROVIDER_DELAY}s before trying {provider['name']} (round {round_num + 1})...")
+                     time.sleep(PROVIDER_DELAY)
+                 is_first_attempt = False
+
+                 print(f"Attempting LLM call with {provider['name']} (round {round_num + 1}/{MAX_ROUNDS})...")
+                 start_time = time.perf_counter()
+
+                 try:
+                     content, error = self._call_provider(
+                         provider=provider,
+                         prompt=prompt,
+                         temperature=temperature,
+                         max_tokens=max_tokens
+                     )
+                     latency_ms = int((time.perf_counter() - start_time) * 1000)
+
+                     if content:
+                         print(f"Success with {provider['name']} ({latency_ms}ms)")
+                         provider_info = f"{provider['name']}:{provider['model']}"
+                         return content, provider_info, None, providers_failed
+                     else:
+                         errors.append(f"{provider['name']}: {error}")
+                         providers_failed.append({"name": provider['name'], "error": error})
+                         print(f"Provider {provider['name']} failed: {error}")
+
+                 except Exception as e:
+                     errors.append(f"{provider['name']}: {str(e)}")
+                     providers_failed.append({"name": provider['name'], "error": str(e)})
+                     print(f"Provider {provider['name']} exception: {e}")
+
+         return None, None, f"All LLM providers failed after {MAX_ROUNDS} rounds: {'; '.join(errors)}", providers_failed
+
+     def _make_request(self, url: str, headers: dict, payload: dict, provider_name: str) -> requests.Response:
+         """Make HTTP request to provider (no internal retry - rotation handles retries)."""
+         response = requests.post(url, headers=headers, json=payload, timeout=30)
+         response.raise_for_status()
+         return response

      def _call_provider(self, provider: dict, prompt: str, temperature: float, max_tokens: int) -> Tuple[Optional[str], Optional[str]]:
-         """Call a specific LLM provider with retry on rate limit."""
+         """Call a specific LLM provider."""
          headers = {"Content-Type": "application/json"}

          if provider["name"] == "groq":
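Because _make_request no longer catches anything, a 429 now surfaces as requests.exceptions.HTTPError from raise_for_status() and is absorbed by the broad except in query(), which records it and rotates to the next provider. The raised exception can be seen without a live server:

    import requests

    resp = requests.Response()
    resp.status_code = 429  # simulate a rate-limited reply
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # In query(), this lands in the generic `except Exception` branch
        print(f"would be appended to errors: {e}")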
@@ -149,7 +127,7 @@ class LLMClient:
              "max_tokens": max_tokens,
              "temperature": temperature,
          }
-         response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
+         response = self._make_request(provider["url"], headers, payload, provider["name"])
          data = response.json()
          if data and "choices" in data and data["choices"]:
              return data["choices"][0]["message"]["content"], None
@@ -164,7 +142,7 @@ class LLMClient:
                  "maxOutputTokens": max_tokens,
              }
          }
-         response = self._request_with_retry(url, headers, payload, provider["name"])
+         response = self._make_request(url, headers, payload, provider["name"])
          data = response.json()
          if data and "candidates" in data and data["candidates"]:
              first_candidate = data["candidates"][0]
@@ -184,7 +162,7 @@ class LLMClient:
              "max_tokens": max_tokens,
              "temperature": temperature,
          }
-         response = self._request_with_retry(provider["url"], headers, payload, provider["name"])
+         response = self._make_request(provider["url"], headers, payload, provider["name"])
          data = response.json()
          if data and "choices" in data and data["choices"]:
              return data["choices"][0]["message"]["content"], None
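All three call sites now share _make_request and differ only in payload and response shape: Groq and OpenRouter use the OpenAI-style choices[...].message.content path, while Gemini returns candidates. A minimal sketch of the two extraction paths; the response bodies are illustrative stand-ins, not captured API output:

    # Hypothetical response bodies showing the two parsing branches above.
    openai_style = {"choices": [{"message": {"content": "hi"}}]}
    gemini_style = {"candidates": [{"content": {"parts": [{"text": "hi"}]}}]}

    print(openai_style["choices"][0]["message"]["content"])
    # Gemini parsing past first_candidate isn't shown in this diff; the
    # nested parts/text layout here is an assumption about its shape.
    first_candidate = gemini_style["candidates"][0]
    print(first_candidate["content"]["parts"][0]["text"])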
 