Aksel Joonas Reedi and Claude Opus 4.7 (1M context) committed on
Fall back to user HF token for router when INFERENCE_TOKEN is unset (#39)
The HF router code path only read INFERENCE_TOKEN, which is the shared
server-side key set on the hosted Space so inference is free for users.
On the CLI / self-hosted path that env var is absent, so requests went
out with no bearer token and the router returned 401, which surfaced to users
as "Authentication failed" even with a valid HF_TOKEN (issue #36).
Resolve api_key in this order (sketched in the snippet below):
1. INFERENCE_TOKEN env (unchanged Space behavior: shared billing)
2. session.hf_token (user's OAuth / CLI token)
3. HF_TOKEN env (belt-and-suspenders for CLI)
Applied to _resolve_hf_router_params, research_tool._resolve_llm_params,
and ContextManager.compact.
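
A minimal sketch of that chain (the helper name is hypothetical; the PR inlines
this expression at each of the three call sites rather than sharing a helper):

import os

def _resolve_inference_key(session_hf_token: str | None) -> str | None:
    # 1. Shared Space key: set only on the hosted Space, billed to its owner.
    # 2. The user's own token carried on the session (OAuth or CLI login).
    # 3. HF_TOKEN from the environment, the last-resort CLI fallback.
    return (
        os.environ.get("INFERENCE_TOKEN")
        or session_hf_token
        or os.environ.get("HF_TOKEN")
    )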
Fixes #36
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- agent/context_manager/manager.py +9 -2
- agent/core/agent_loop.py +19 -5
- agent/tools/research_tool.py +11 -3
agent/context_manager/manager.py
CHANGED
@@ -263,7 +263,10 @@ class ContextManager:
         return False
 
     async def compact(
-        self, model_name: str, tool_specs: list[dict] | None = None
+        self,
+        model_name: str,
+        tool_specs: list[dict] | None = None,
+        hf_token: str | None = None,
     ) -> None:
         """Remove old messages to keep history under target size"""
         if (self.context_length <= self.max_context) or not self.items:
@@ -303,7 +306,11 @@ class ContextManager:
             )
         )
 
-        hf_key = os.environ.get("INFERENCE_TOKEN")
+        hf_key = (
+            os.environ.get("INFERENCE_TOKEN")
+            or hf_token
+            or os.environ.get("HF_TOKEN")
+        )
         response = await acompletion(
             model=model_name,
             messages=messages_to_summarize,
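A hedged pytest sketch (the test and the extracted _hf_key helper are
hypothetical; compact() inlines the same expression) pinning down the
precedence the fallback encodes:

import os

def _hf_key(hf_token: str | None) -> str | None:
    # Same fallback chain as the hf_key expression inside compact().
    return os.environ.get("INFERENCE_TOKEN") or hf_token or os.environ.get("HF_TOKEN")

def test_hf_key_precedence(monkeypatch):
    monkeypatch.setenv("INFERENCE_TOKEN", "shared")
    assert _hf_key("user") == "shared"   # Space key always wins (shared billing)
    monkeypatch.delenv("INFERENCE_TOKEN")
    monkeypatch.setenv("HF_TOKEN", "env")
    assert _hf_key("user") == "user"     # user's token beats plain HF_TOKEN
    assert _hf_key(None) == "env"        # the CLI path that used to 401 (issue #36)
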
agent/core/agent_loop.py
CHANGED
@@ -20,11 +20,11 @@ from agent.tools.jobs_tool import CPU_FLAVORS
 logger = logging.getLogger(__name__)
 
 ToolCall = ChatCompletionMessageToolCall
-# Explicit inference token for LLM API calls (separate from user OAuth tokens).
-_INFERENCE_API_KEY = os.environ.get("INFERENCE_TOKEN")
 
 
-def _resolve_hf_router_params(model_name: str) -> dict:
+def _resolve_hf_router_params(
+    model_name: str, session_hf_token: str | None = None
+) -> dict:
     """
     Build LiteLLM kwargs for HuggingFace Router models.
 
@@ -35,6 +35,13 @@ def _resolve_hf_router_params(model_name: str) -> dict:
 
     Input format: huggingface/<router_provider>/<org>/<model>
     Example: huggingface/novita/moonshotai/kimi-k2.5
+
+    Token resolution (first non-empty wins):
+    1. INFERENCE_TOKEN env → shared key on the hosted Space so inference
+       is free for users and billed to the Space owner.
+    2. session.hf_token → the user's own token (CLI or self-hosted),
+       resolved from env / huggingface-cli login / cached token file.
+    3. HF_TOKEN env → belt-and-suspenders fallback for CLI users.
     """
     if not model_name.startswith("huggingface/"):
         return {"model": model_name}
@@ -47,7 +54,11 @@ def _resolve_hf_router_params(model_name: str) -> dict:
 
     router_provider = parts[1]
    actual_model = parts[2]
-    api_key = _INFERENCE_API_KEY
+    api_key = (
+        os.environ.get("INFERENCE_TOKEN")
+        or session_hf_token
+        or os.environ.get("HF_TOKEN")
+    )
 
     return {
         "model": f"openai/{actual_model}",
@@ -205,6 +216,7 @@ async def _compact_and_notify(session: Session) -> None:
     await session.context_manager.compact(
         model_name=session.config.model_name,
         tool_specs=tool_specs,
+        hf_token=session.hf_token,
     )
     new_length = session.context_manager.context_length
     if new_length != old_length:
@@ -506,7 +518,9 @@ class Handlers:
        tools = session.tool_router.get_tool_specs_for_llm()
        try:
            # ── Call the LLM (streaming or non-streaming) ──
-           llm_params = _resolve_hf_router_params(session.config.model_name)
+           llm_params = _resolve_hf_router_params(
+               session.config.model_name, session.hf_token
+           )
            if session.stream:
                llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
            else:
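For the docstring's example model, the resolver now yields roughly the
following (a hypothetical REPL-style check; the api_base shape is assumed to
mirror the research_tool helper below, and values depend on the environment):

# With INFERENCE_TOKEN unset:
params = _resolve_hf_router_params(
    "huggingface/novita/moonshotai/kimi-k2.5",
    session_hf_token="hf_user_token",  # placeholder value
)
# params (approximately):
# {
#     "model": "openai/moonshotai/kimi-k2.5",
#     "api_base": "https://router.huggingface.co/novita/v3/openai",
#     "api_key": "hf_user_token",
# }
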
agent/tools/research_tool.py
CHANGED
@@ -213,7 +213,9 @@ RESEARCH_TOOL_SPEC = {
 }
 
 
-def _resolve_llm_params(model_name: str) -> dict:
+def _resolve_llm_params(
+    model_name: str, session_hf_token: str | None = None
+) -> dict:
     """Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
     if not model_name.startswith("huggingface/"):
         return {"model": model_name}
@@ -224,10 +226,16 @@ def _resolve_llm_params(model_name: str) -> dict:
 
     provider = parts[1]
     model_id = parts[2]
+    api_key = (
+        os.environ.get("INFERENCE_TOKEN")
+        or session_hf_token
+        or os.environ.get("HF_TOKEN")
+        or ""
+    )
     return {
         "model": f"openai/{model_id}",
         "api_base": f"https://router.huggingface.co/{provider}/v3/openai",
-        "api_key": os.environ.get("INFERENCE_TOKEN", ""),
+        "api_key": api_key,
     }
 
 
@@ -264,7 +272,7 @@ async def research_handler(
     # Use a cheaper/faster model for research
     main_model = session.config.model_name
     research_model = _get_research_model(main_model)
-    llm_params = _resolve_llm_params(research_model)
+    llm_params = _resolve_llm_params(research_model, getattr(session, "hf_token", None))
 
     # Get read-only tool specs from the session's tool router
     tool_specs = [
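Note the getattr(session, "hf_token", None) read: presumably a defensive
choice so Session objects without the attribute (older callers, test doubles)
fall back to the env vars instead of raising AttributeError:

# Equivalent to the call above, spelled out:
token = getattr(session, "hf_token", None)  # None if the attribute is absent
llm_params = _resolve_llm_params(research_model, token)
# With no env vars and no session token, api_key falls back to "" and the
# router rejects the call, matching the pre-fix behavior rather than crashing.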