Aksel Joonas Reedi Claude Opus 4.7 (1M context) committed on
Commit
182ddee
·
unverified ·
1 Parent(s): d0a9a6f

Fall back to user HF token for router when INFERENCE_TOKEN is unset (#39)

Browse files

The HF router code path only read INFERENCE_TOKEN, which is the shared
server-side key set on the hosted Space so inference is free for users.
On the CLI / self-hosted path that env var is absent, so requests went
out with no bearer token and the router returned 401 — surfaced to users
as "Authentication failed" even with a valid HF_TOKEN (issue #36).

Resolve api_key in this order:
1. INFERENCE_TOKEN env (unchanged Space behavior — shared billing)
2. session.hf_token (user's OAuth / CLI token)
3. HF_TOKEN env (belt-and-suspenders for CLI)

Applied to _resolve_hf_router_params, research_tool._resolve_llm_params,
and ContextManager.compact.

Fixes #36

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent/context_manager/manager.py CHANGED
@@ -263,7 +263,10 @@ class ContextManager:
263
  return False
264
 
265
  async def compact(
266
- self, model_name: str, tool_specs: list[dict] | None = None
 
 
 
267
  ) -> None:
268
  """Remove old messages to keep history under target size"""
269
  if (self.context_length <= self.max_context) or not self.items:
@@ -303,7 +306,11 @@ class ContextManager:
303
  )
304
  )
305
 
306
- hf_key = os.environ.get("INFERENCE_TOKEN")
 
 
 
 
307
  response = await acompletion(
308
  model=model_name,
309
  messages=messages_to_summarize,
 
263
  return False
264
 
265
  async def compact(
266
+ self,
267
+ model_name: str,
268
+ tool_specs: list[dict] | None = None,
269
+ hf_token: str | None = None,
270
  ) -> None:
271
  """Remove old messages to keep history under target size"""
272
  if (self.context_length <= self.max_context) or not self.items:
 
306
  )
307
  )
308
 
309
+ hf_key = (
310
+ os.environ.get("INFERENCE_TOKEN")
311
+ or hf_token
312
+ or os.environ.get("HF_TOKEN")
313
+ )
314
  response = await acompletion(
315
  model=model_name,
316
  messages=messages_to_summarize,
agent/core/agent_loop.py CHANGED
@@ -20,11 +20,11 @@ from agent.tools.jobs_tool import CPU_FLAVORS
20
  logger = logging.getLogger(__name__)
21
 
22
  ToolCall = ChatCompletionMessageToolCall
23
- # Explicit inference token for LLM API calls (separate from user OAuth tokens).
24
- _INFERENCE_API_KEY = os.environ.get("INFERENCE_TOKEN")
25
 
26
 
27
- def _resolve_hf_router_params(model_name: str) -> dict:
 
 
28
  """
29
  Build LiteLLM kwargs for HuggingFace Router models.
30
 
@@ -35,6 +35,13 @@ def _resolve_hf_router_params(model_name: str) -> dict:
35
 
36
  Input format: huggingface/<router_provider>/<org>/<model>
37
  Example: huggingface/novita/moonshotai/kimi-k2.5
 
 
 
 
 
 
 
38
  """
39
  if not model_name.startswith("huggingface/"):
40
  return {"model": model_name}
@@ -47,7 +54,11 @@ def _resolve_hf_router_params(model_name: str) -> dict:
47
 
48
  router_provider = parts[1]
49
  actual_model = parts[2]
50
- api_key = _INFERENCE_API_KEY
 
 
 
 
51
 
52
  return {
53
  "model": f"openai/{actual_model}",
@@ -205,6 +216,7 @@ async def _compact_and_notify(session: Session) -> None:
205
  await session.context_manager.compact(
206
  model_name=session.config.model_name,
207
  tool_specs=tool_specs,
 
208
  )
209
  new_length = session.context_manager.context_length
210
  if new_length != old_length:
@@ -506,7 +518,9 @@ class Handlers:
506
  tools = session.tool_router.get_tool_specs_for_llm()
507
  try:
508
  # ── Call the LLM (streaming or non-streaming) ──
509
- llm_params = _resolve_hf_router_params(session.config.model_name)
 
 
510
  if session.stream:
511
  llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
512
  else:
 
20
  logger = logging.getLogger(__name__)
21
 
22
  ToolCall = ChatCompletionMessageToolCall
 
 
23
 
24
 
25
+ def _resolve_hf_router_params(
26
+ model_name: str, session_hf_token: str | None = None
27
+ ) -> dict:
28
  """
29
  Build LiteLLM kwargs for HuggingFace Router models.
30
 
 
35
 
36
  Input format: huggingface/<router_provider>/<org>/<model>
37
  Example: huggingface/novita/moonshotai/kimi-k2.5
38
+
39
+ Token resolution (first non-empty wins):
40
+ 1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
41
+ is free for users and billed to the Space owner.
42
+ 2. session.hf_token — the user's own token (CLI or self-hosted),
43
+ resolved from env / huggingface-cli login / cached token file.
44
+ 3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
45
  """
46
  if not model_name.startswith("huggingface/"):
47
  return {"model": model_name}
 
54
 
55
  router_provider = parts[1]
56
  actual_model = parts[2]
57
+ api_key = (
58
+ os.environ.get("INFERENCE_TOKEN")
59
+ or session_hf_token
60
+ or os.environ.get("HF_TOKEN")
61
+ )
62
 
63
  return {
64
  "model": f"openai/{actual_model}",
 
216
  await session.context_manager.compact(
217
  model_name=session.config.model_name,
218
  tool_specs=tool_specs,
219
+ hf_token=session.hf_token,
220
  )
221
  new_length = session.context_manager.context_length
222
  if new_length != old_length:
 
518
  tools = session.tool_router.get_tool_specs_for_llm()
519
  try:
520
  # ── Call the LLM (streaming or non-streaming) ──
521
+ llm_params = _resolve_hf_router_params(
522
+ session.config.model_name, session.hf_token
523
+ )
524
  if session.stream:
525
  llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
526
  else:
agent/tools/research_tool.py CHANGED
@@ -213,7 +213,9 @@ RESEARCH_TOOL_SPEC = {
213
  }
214
 
215
 
216
- def _resolve_llm_params(model_name: str) -> dict:
 
 
217
  """Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
218
  if not model_name.startswith("huggingface/"):
219
  return {"model": model_name}
@@ -224,10 +226,16 @@ def _resolve_llm_params(model_name: str) -> dict:
224
 
225
  provider = parts[1]
226
  model_id = parts[2]
 
 
 
 
 
 
227
  return {
228
  "model": f"openai/{model_id}",
229
  "api_base": f"https://router.huggingface.co/{provider}/v3/openai",
230
- "api_key": os.environ.get("INFERENCE_TOKEN", ""),
231
  }
232
 
233
 
@@ -264,7 +272,7 @@ async def research_handler(
264
  # Use a cheaper/faster model for research
265
  main_model = session.config.model_name
266
  research_model = _get_research_model(main_model)
267
- llm_params = _resolve_llm_params(research_model)
268
 
269
  # Get read-only tool specs from the session's tool router
270
  tool_specs = [
 
213
  }
214
 
215
 
216
+ def _resolve_llm_params(
217
+ model_name: str, session_hf_token: str | None = None
218
+ ) -> dict:
219
  """Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
220
  if not model_name.startswith("huggingface/"):
221
  return {"model": model_name}
 
226
 
227
  provider = parts[1]
228
  model_id = parts[2]
229
+ api_key = (
230
+ os.environ.get("INFERENCE_TOKEN")
231
+ or session_hf_token
232
+ or os.environ.get("HF_TOKEN")
233
+ or ""
234
+ )
235
  return {
236
  "model": f"openai/{model_id}",
237
  "api_base": f"https://router.huggingface.co/{provider}/v3/openai",
238
+ "api_key": api_key,
239
  }
240
 
241
 
 
272
  # Use a cheaper/faster model for research
273
  main_model = session.config.model_name
274
  research_model = _get_research_model(main_model)
275
+ llm_params = _resolve_llm_params(research_model, getattr(session, "hf_token", None))
276
 
277
  # Get read-only tool specs from the session's tool router
278
  tool_specs = [