Aksel Joonas Reedi committed on
Commit
f30ed48
·
unverified ·
1 Parent(s): 849c88c

Title gen: swap Llama-3.1-8B for gpt-oss-120b via Cerebras (#58)

Browse files
Files changed (1) hide show
  1. backend/routes/agent.py +8 -5
backend/routes/agent.py CHANGED
@@ -166,9 +166,11 @@ async def generate_title(
166
  ) -> dict:
167
  """Generate a short title for a chat session based on the first user message.
168
 
169
- Always uses Llama-3.1-8B-Instruct via Cerebras on the HF router. The tab
170
- headline renders as plain text, so the model is told to avoid markdown
171
- and any stray formatting characters are stripped before returning.
 
 
172
  """
173
  api_key = (
174
  os.environ.get("INFERENCE_TOKEN")
@@ -179,7 +181,7 @@ async def generate_title(
179
  response = await acompletion(
180
  # Double openai/ prefix: LiteLLM strips the first as its provider
181
  # prefix, leaving the HF model id on the wire for the router.
182
- model="openai/meta-llama/Llama-3.1-8B-Instruct:cerebras",
183
  api_base="https://router.huggingface.co/v1",
184
  api_key=api_key,
185
  messages=[
@@ -195,9 +197,10 @@ async def generate_title(
195
  },
196
  {"role": "user", "content": request.text[:500]},
197
  ],
198
- max_tokens=20,
199
  temperature=0.3,
200
  timeout=10,
 
201
  )
202
  title = response.choices[0].message.content.strip().strip('"').strip("'")
203
  title = title.translate(_TITLE_STRIP_CHARS).strip()
 
166
  ) -> dict:
167
  """Generate a short title for a chat session based on the first user message.
168
 
169
+ Always uses gpt-oss-120b via Cerebras on the HF router. The tab headline
170
+ renders as plain text, so the model is told to avoid markdown and any
171
+ stray formatting characters are stripped before returning. gpt-oss is a
172
+ reasoning model — reasoning_effort=low keeps the reasoning budget small
173
+ so the 60-token output budget isn't consumed before the title is written.
174
  """
175
  api_key = (
176
  os.environ.get("INFERENCE_TOKEN")
 
181
  response = await acompletion(
182
  # Double openai/ prefix: LiteLLM strips the first as its provider
183
  # prefix, leaving the HF model id on the wire for the router.
184
+ model="openai/openai/gpt-oss-120b:cerebras",
185
  api_base="https://router.huggingface.co/v1",
186
  api_key=api_key,
187
  messages=[
 
197
  },
198
  {"role": "user", "content": request.text[:500]},
199
  ],
200
+ max_tokens=60,
201
  temperature=0.3,
202
  timeout=10,
203
+ reasoning_effort="low",
204
  )
205
  title = response.choices[0].message.content.strip().strip('"').strip("'")
206
  title = title.translate(_TITLE_STRIP_CHARS).strip()