Spaces:
Running
Running
GitHub Actions commited on
Commit ·
9563e4a
1
Parent(s): 1d47e3c
Deploy 1ba9ba6
Browse files- app/api/chat.py +42 -6
- app/core/quality.py +7 -0
- app/pipeline/nodes/generate.py +70 -2
- tests/test_chat_stream_reliability.py +54 -0
- tests/test_generate_focus_selection.py +37 -0
- tests/test_generate_quality_fallback.py +40 -0
- tests/test_parser_config.py +40 -0
- tests/test_parser_sanitization.py +26 -0
app/api/chat.py
CHANGED
|
@@ -12,6 +12,14 @@ from app.security.jwt_auth import verify_jwt
|
|
| 12 |
|
| 13 |
router = APIRouter()
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# Phrases a visitor uses when telling the bot it gave a wrong answer.
|
| 16 |
# Matched on the lowercased raw message before any LLM call — O(1), zero cost.
|
| 17 |
_CRITICISM_SIGNALS: frozenset[str] = frozenset({
|
|
@@ -72,7 +80,6 @@ async def _generate_follow_ups(
|
|
| 72 |
for s in sources[:4]:
|
| 73 |
title = s.title if hasattr(s, "title") else s.get("title", "")
|
| 74 |
src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
|
| 75 |
-
url = s.url if hasattr(s, "url") else s.get("url", "")
|
| 76 |
if title:
|
| 77 |
source_info.append(f"{title} ({src_type})" if src_type else title)
|
| 78 |
|
|
@@ -196,7 +203,7 @@ async def chat_endpoint(
|
|
| 196 |
# will use it if present; Guard runs first so the latency is masked).
|
| 197 |
if decontext_task is not None:
|
| 198 |
try:
|
| 199 |
-
result = await asyncio.wait_for(decontext_task, timeout=
|
| 200 |
if result and result.strip().lower() != request_data.message.strip().lower():
|
| 201 |
decontextualized_query = result.strip()
|
| 202 |
except Exception:
|
|
@@ -206,7 +213,7 @@ async def chat_endpoint(
|
|
| 206 |
expansion_result: dict | None = None
|
| 207 |
if expansion_task is not None:
|
| 208 |
try:
|
| 209 |
-
expansion_result = await asyncio.wait_for(expansion_task, timeout=
|
| 210 |
except Exception:
|
| 211 |
pass # Expansion is best-effort; retriever falls back to raw query.
|
| 212 |
|
|
@@ -260,14 +267,39 @@ async def chat_endpoint(
|
|
| 260 |
interaction_id = None
|
| 261 |
|
| 262 |
try:
|
|
|
|
|
|
|
|
|
|
| 263 |
# stream_mode=["custom", "updates"] yields (mode, data) tuples:
|
| 264 |
# mode="custom" → data is whatever writer(payload) was called with
|
| 265 |
# mode="updates" → data is {node_name: state_updates_dict}
|
| 266 |
-
|
| 267 |
initial_state,
|
| 268 |
stream_mode=["custom", "updates"],
|
| 269 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
if await request.is_disconnected():
|
|
|
|
|
|
|
| 271 |
break
|
| 272 |
|
| 273 |
if mode == "custom":
|
|
@@ -342,5 +374,9 @@ async def chat_endpoint(
|
|
| 342 |
return StreamingResponse(
|
| 343 |
sse_generator(),
|
| 344 |
media_type="text/event-stream",
|
| 345 |
-
headers={
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
)
|
|
|
|
| 12 |
|
| 13 |
router = APIRouter()
|
| 14 |
|
| 15 |
+
# Keep-alive interval for SSE when upstream nodes are still working.
|
| 16 |
+
# Prevents edge/proxy idle timeouts on long retrieval/generation turns.
|
| 17 |
+
_SSE_HEARTBEAT_SECONDS: float = 10.0
|
| 18 |
+
|
| 19 |
+
# Query pre-processing budgets must stay low to avoid delaying first byte.
|
| 20 |
+
_DECONTEXT_TIMEOUT_SECONDS: float = 0.35
|
| 21 |
+
_EXPANSION_TIMEOUT_SECONDS: float = 0.25
|
| 22 |
+
|
| 23 |
# Phrases a visitor uses when telling the bot it gave a wrong answer.
|
| 24 |
# Matched on the lowercased raw message before any LLM call — O(1), zero cost.
|
| 25 |
_CRITICISM_SIGNALS: frozenset[str] = frozenset({
|
|
|
|
| 80 |
for s in sources[:4]:
|
| 81 |
title = s.title if hasattr(s, "title") else s.get("title", "")
|
| 82 |
src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
|
|
|
|
| 83 |
if title:
|
| 84 |
source_info.append(f"{title} ({src_type})" if src_type else title)
|
| 85 |
|
|
|
|
| 203 |
# will use it if present; Guard runs first so the latency is masked).
|
| 204 |
if decontext_task is not None:
|
| 205 |
try:
|
| 206 |
+
result = await asyncio.wait_for(decontext_task, timeout=_DECONTEXT_TIMEOUT_SECONDS)
|
| 207 |
if result and result.strip().lower() != request_data.message.strip().lower():
|
| 208 |
decontextualized_query = result.strip()
|
| 209 |
except Exception:
|
|
|
|
| 213 |
expansion_result: dict | None = None
|
| 214 |
if expansion_task is not None:
|
| 215 |
try:
|
| 216 |
+
expansion_result = await asyncio.wait_for(expansion_task, timeout=_EXPANSION_TIMEOUT_SECONDS)
|
| 217 |
except Exception:
|
| 218 |
pass # Expansion is best-effort; retriever falls back to raw query.
|
| 219 |
|
|
|
|
| 267 |
interaction_id = None
|
| 268 |
|
| 269 |
try:
|
| 270 |
+
# Emit an early event so clients/proxies receive first bytes quickly.
|
| 271 |
+
yield f"event: status\ndata: {json.dumps({'label': 'Starting response...'})}\n\n"
|
| 272 |
+
|
| 273 |
# stream_mode=["custom", "updates"] yields (mode, data) tuples:
|
| 274 |
# mode="custom" → data is whatever writer(payload) was called with
|
| 275 |
# mode="updates" → data is {node_name: state_updates_dict}
|
| 276 |
+
stream_iter = pipeline.astream(
|
| 277 |
initial_state,
|
| 278 |
stream_mode=["custom", "updates"],
|
| 279 |
+
).__aiter__()
|
| 280 |
+
next_item_task: asyncio.Task | None = asyncio.create_task(stream_iter.__anext__())
|
| 281 |
+
|
| 282 |
+
while True:
|
| 283 |
+
try:
|
| 284 |
+
mode, data = await asyncio.wait_for(
|
| 285 |
+
asyncio.shield(next_item_task),
|
| 286 |
+
timeout=_SSE_HEARTBEAT_SECONDS,
|
| 287 |
+
)
|
| 288 |
+
except asyncio.TimeoutError:
|
| 289 |
+
if await request.is_disconnected():
|
| 290 |
+
if not next_item_task.done():
|
| 291 |
+
next_item_task.cancel()
|
| 292 |
+
break
|
| 293 |
+
yield f"event: ping\ndata: {json.dumps({'ts': int(time.time())})}\n\n"
|
| 294 |
+
continue
|
| 295 |
+
except StopAsyncIteration:
|
| 296 |
+
break
|
| 297 |
+
|
| 298 |
+
next_item_task = asyncio.create_task(stream_iter.__anext__())
|
| 299 |
+
|
| 300 |
if await request.is_disconnected():
|
| 301 |
+
if not next_item_task.done():
|
| 302 |
+
next_item_task.cancel()
|
| 303 |
break
|
| 304 |
|
| 305 |
if mode == "custom":
|
|
|
|
| 374 |
return StreamingResponse(
|
| 375 |
sse_generator(),
|
| 376 |
media_type="text/event-stream",
|
| 377 |
+
headers={
|
| 378 |
+
"Cache-Control": "no-cache",
|
| 379 |
+
"X-Accel-Buffering": "no",
|
| 380 |
+
"Connection": "keep-alive",
|
| 381 |
+
},
|
| 382 |
)
|
app/core/quality.py
CHANGED
|
@@ -29,8 +29,13 @@ _HEDGE_PHRASES: tuple[str, ...] = (
|
|
| 29 |
"does not provide",
|
| 30 |
"does not offer",
|
| 31 |
"no detailed information",
|
|
|
|
|
|
|
|
|
|
| 32 |
)
|
| 33 |
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
| 36 |
"""
|
|
@@ -46,6 +51,8 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
|
| 46 |
lowered = answer.lower()
|
| 47 |
if any(phrase in lowered for phrase in _HEDGE_PHRASES):
|
| 48 |
return True
|
|
|
|
|
|
|
| 49 |
if chunks and not re.search(r"\[\d+\]", answer):
|
| 50 |
return True
|
| 51 |
if complexity == "complex" and len(answer.split()) < 30:
|
|
|
|
| 29 |
"does not provide",
|
| 30 |
"does not offer",
|
| 31 |
"no detailed information",
|
| 32 |
+
"not explicitly state",
|
| 33 |
+
"not explicitly stated",
|
| 34 |
+
"cannot be verified",
|
| 35 |
)
|
| 36 |
|
| 37 |
+
_RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
|
| 38 |
+
|
| 39 |
|
| 40 |
def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
|
| 41 |
"""
|
|
|
|
| 51 |
lowered = answer.lower()
|
| 52 |
if any(phrase in lowered for phrase in _HEDGE_PHRASES):
|
| 53 |
return True
|
| 54 |
+
if _RAW_TAG_RE.search(answer):
|
| 55 |
+
return True
|
| 56 |
if chunks and not re.search(r"\[\d+\]", answer):
|
| 57 |
return True
|
| 58 |
if complexity == "complex" and len(answer.split()) < 30:
|
app/pipeline/nodes/generate.py
CHANGED
|
@@ -17,6 +17,9 @@ logger = logging.getLogger(__name__)
|
|
| 17 |
_THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
|
| 18 |
_THINK_OPEN_RE = re.compile(r"<think>")
|
| 19 |
_THINK_CLOSE_RE = re.compile(r"</think>")
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
|
| 22 |
# (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
|
|
@@ -73,6 +76,10 @@ ANSWERING RULES — follow all of them every time:
|
|
| 73 |
those facts — a short confident answer beats a padded hallucinated one.
|
| 74 |
7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
|
| 75 |
8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
RELEVANCE CHECK — do this BEFORE writing:
|
| 78 |
- Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
|
|
@@ -234,13 +241,67 @@ def _normalise_answer_text(answer: str, max_citation_index: int) -> str:
|
|
| 234 |
idx = int(match.group(1))
|
| 235 |
return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
|
| 236 |
|
| 237 |
-
cleaned =
|
|
|
|
| 238 |
cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
|
| 239 |
cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
|
| 240 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 241 |
return cleaned.strip()
|
| 242 |
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
|
| 245 |
# Number of token chunks to buffer before deciding there is no CoT block.
|
| 246 |
# Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
|
|
@@ -310,7 +371,8 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 310 |
# TextOps become [1] and [2] — the LLM cites both in the same sentence,
|
| 311 |
# which looks like self-citing hallucination even though it is technically
|
| 312 |
# correct. _merge_by_source preserves all text; nothing is discarded.
|
| 313 |
-
|
|
|
|
| 314 |
context_parts: list[str] = []
|
| 315 |
source_refs: list[SourceRef] = []
|
| 316 |
|
|
@@ -457,6 +519,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
|
|
| 457 |
|
| 458 |
full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
# Only surface sources the LLM actually cited, deduplicated by URL so
|
| 461 |
# multiple chunks from the same document show as one source card.
|
| 462 |
cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
|
|
|
|
| 17 |
_THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
|
| 18 |
_THINK_OPEN_RE = re.compile(r"<think>")
|
| 19 |
_THINK_CLOSE_RE = re.compile(r"</think>")
|
| 20 |
+
_GEN_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
|
| 21 |
+
_VERSION_PARITY_RE = re.compile(r"\b(up[- ]?to[- ]?date|latest|current|in sync|same version|version)\b", re.IGNORECASE)
|
| 22 |
+
_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
|
| 23 |
|
| 24 |
# Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
|
| 25 |
# (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
|
|
|
|
| 76 |
those facts — a short confident answer beats a padded hallucinated one.
|
| 77 |
7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
|
| 78 |
8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
|
| 79 |
+
9. If asked about freshness/version parity (e.g., "up-to-date", "same as demo"), and passages
|
| 80 |
+
do not explicitly confirm it, answer in at most 2 sentences: state what is known from passages,
|
| 81 |
+
then explicitly say it cannot be verified from indexed sources.
|
| 82 |
+
10. Do not list unrelated projects or sources unless the user asked for a list/compare.
|
| 83 |
|
| 84 |
RELEVANCE CHECK — do this BEFORE writing:
|
| 85 |
- Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
|
|
|
|
| 241 |
idx = int(match.group(1))
|
| 242 |
return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
|
| 243 |
|
| 244 |
+
cleaned = _GEN_HTML_TAG_RE.sub("", answer)
|
| 245 |
+
cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, cleaned)
|
| 246 |
cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
|
| 247 |
cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
|
| 248 |
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
|
| 249 |
return cleaned.strip()
|
| 250 |
|
| 251 |
|
| 252 |
+
def _build_low_trust_fallback(query: str, source_refs: list[SourceRef]) -> str:
|
| 253 |
+
"""Deterministic concise fallback when model output still fails trust checks."""
|
| 254 |
+
if not source_refs:
|
| 255 |
+
return _NOT_FOUND_ANSWER
|
| 256 |
+
|
| 257 |
+
first = source_refs[0]
|
| 258 |
+
title = first.title or "the retrieved source"
|
| 259 |
+
|
| 260 |
+
if _VERSION_PARITY_RE.search(query):
|
| 261 |
+
return (
|
| 262 |
+
f"The retrieved sources confirm links/details for {title} [1], but they do not explicitly "
|
| 263 |
+
"confirm whether the GitHub code and live demo are currently in sync, so version parity "
|
| 264 |
+
"cannot be verified from the indexed content alone [1]."
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
return (
|
| 268 |
+
f"Based on the retrieved evidence, the answer is grounded in {title} [1]. "
|
| 269 |
+
"If you want deeper detail, ask for a specific section, implementation part, or comparison."
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _select_chunks_for_prompt(query: str, reranked_chunks: list[dict]) -> list[dict]:
|
| 274 |
+
"""
|
| 275 |
+
Prefer chunks whose source title is explicitly referenced in the query.
|
| 276 |
+
|
| 277 |
+
This prevents focused questions (e.g. one project) from receiving multi-project
|
| 278 |
+
blended context that can trigger verbose, low-quality comparison answers.
|
| 279 |
+
"""
|
| 280 |
+
if not reranked_chunks:
|
| 281 |
+
return reranked_chunks
|
| 282 |
+
|
| 283 |
+
query_lower = query.lower()
|
| 284 |
+
focused: list[dict] = []
|
| 285 |
+
|
| 286 |
+
for chunk in reranked_chunks:
|
| 287 |
+
title = str(chunk["metadata"].get("source_title", "")).strip()
|
| 288 |
+
if not title:
|
| 289 |
+
continue
|
| 290 |
+
title_lower = title.lower()
|
| 291 |
+
if len(title_lower) >= 4 and title_lower in query_lower:
|
| 292 |
+
focused.append(chunk)
|
| 293 |
+
continue
|
| 294 |
+
|
| 295 |
+
title_tokens = [t for t in _WORD_RE.findall(title_lower) if len(t) >= 4]
|
| 296 |
+
if title_tokens and sum(1 for tok in title_tokens if tok in query_lower) >= min(2, len(title_tokens)):
|
| 297 |
+
focused.append(chunk)
|
| 298 |
+
|
| 299 |
+
if focused:
|
| 300 |
+
return focused[:6]
|
| 301 |
+
|
| 302 |
+
return reranked_chunks[:8]
|
| 303 |
+
|
| 304 |
+
|
| 305 |
def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
|
| 306 |
# Number of token chunks to buffer before deciding there is no CoT block.
|
| 307 |
# Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
|
|
|
|
| 371 |
# TextOps become [1] and [2] — the LLM cites both in the same sentence,
|
| 372 |
# which looks like self-citing hallucination even though it is technically
|
| 373 |
# correct. _merge_by_source preserves all text; nothing is discarded.
|
| 374 |
+
selected_chunks = _select_chunks_for_prompt(query, reranked_chunks)
|
| 375 |
+
merged_chunks = _merge_by_source(selected_chunks)
|
| 376 |
context_parts: list[str] = []
|
| 377 |
source_refs: list[SourceRef] = []
|
| 378 |
|
|
|
|
| 519 |
|
| 520 |
full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
|
| 521 |
|
| 522 |
+
# Final guardrail: if answer still looks low-trust after reformat + cleanup,
|
| 523 |
+
# return a concise deterministic fallback anchored to retrieved sources.
|
| 524 |
+
if is_low_trust(full_answer, reranked_chunks, complexity):
|
| 525 |
+
logger.debug("Final low-trust guard triggered; using deterministic fallback.")
|
| 526 |
+
full_answer = _build_low_trust_fallback(query, source_refs)
|
| 527 |
+
|
| 528 |
# Only surface sources the LLM actually cited, deduplicated by URL so
|
| 529 |
# multiple chunks from the same document show as one source card.
|
| 530 |
cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
|
tests/test_chat_stream_reliability.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
from unittest.mock import MagicMock, patch
|
| 5 |
+
|
| 6 |
+
from fastapi.testclient import TestClient
|
| 7 |
+
from jose import jwt
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _make_token() -> str:
|
| 11 |
+
payload = {"sub": "test-user", "exp": int(time.time()) + 3600}
|
| 12 |
+
return jwt.encode(payload, "test-secret-32-chars-long-0000000", algorithm="HS256")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _chat(client: TestClient, message: str) -> str:
|
| 16 |
+
token = _make_token()
|
| 17 |
+
response = client.post(
|
| 18 |
+
"/chat",
|
| 19 |
+
json={"message": message, "session_id": "a1b2c3d4-e5f6-4789-8abc-def012345678"},
|
| 20 |
+
headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
|
| 21 |
+
)
|
| 22 |
+
assert response.status_code == 200
|
| 23 |
+
return response.text
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_stream_emits_early_status_and_heartbeat(monkeypatch):
|
| 27 |
+
# Reduce heartbeat interval so the test can verify keepalive quickly.
|
| 28 |
+
monkeypatch.setattr("app.api.chat._SSE_HEARTBEAT_SECONDS", 0.05)
|
| 29 |
+
|
| 30 |
+
mock_pipeline = MagicMock()
|
| 31 |
+
|
| 32 |
+
async def delayed_astream(state, stream_mode=None):
|
| 33 |
+
await asyncio.sleep(0.12)
|
| 34 |
+
yield ("custom", {"type": "status", "label": "Thinking..."})
|
| 35 |
+
yield ("custom", {"type": "token", "text": "Answer text."})
|
| 36 |
+
yield ("updates", {"generate": {"answer": "Answer text.", "sources": []}})
|
| 37 |
+
|
| 38 |
+
mock_pipeline.astream = delayed_astream
|
| 39 |
+
|
| 40 |
+
with patch("app.main.build_pipeline", return_value=mock_pipeline), \
|
| 41 |
+
patch("app.main.QdrantClient"), \
|
| 42 |
+
patch("app.services.embedder.Embedder"), \
|
| 43 |
+
patch("app.services.reranker.Reranker"):
|
| 44 |
+
from app.main import create_app
|
| 45 |
+
|
| 46 |
+
app = create_app()
|
| 47 |
+
app.state.pipeline = mock_pipeline
|
| 48 |
+
with TestClient(app, raise_server_exceptions=True) as client:
|
| 49 |
+
body = _chat(client, "Tell me about TextOps")
|
| 50 |
+
|
| 51 |
+
assert "event: status" in body
|
| 52 |
+
assert "Starting response..." in body
|
| 53 |
+
assert "event: ping" in body
|
| 54 |
+
assert "Answer text." in body
|
tests/test_generate_focus_selection.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.pipeline.nodes.generate import _select_chunks_for_prompt
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _chunk(title: str, section: str = "Overview") -> dict:
|
| 5 |
+
return {
|
| 6 |
+
"text": f"Info about {title}",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"source_title": title,
|
| 9 |
+
"section": section,
|
| 10 |
+
"source_url": "",
|
| 11 |
+
"source_type": "project",
|
| 12 |
+
},
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_select_chunks_prefers_explicitly_mentioned_source_title() -> None:
|
| 17 |
+
chunks = [
|
| 18 |
+
_chunk("Sorting Demo"),
|
| 19 |
+
_chunk("EchoEcho"),
|
| 20 |
+
_chunk("Donut.asm"),
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
selected = _select_chunks_for_prompt(
|
| 24 |
+
"Is the source code on GitHub up-to-date with the Sorting Demo live demo?",
|
| 25 |
+
chunks,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
assert selected
|
| 29 |
+
assert all(c["metadata"]["source_title"] == "Sorting Demo" for c in selected)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_select_chunks_falls_back_to_top_ranked_when_no_title_match() -> None:
|
| 33 |
+
chunks = [_chunk("A"), _chunk("B"), _chunk("C")]
|
| 34 |
+
|
| 35 |
+
selected = _select_chunks_for_prompt("What technologies are used?", chunks)
|
| 36 |
+
|
| 37 |
+
assert selected == chunks
|
tests/test_generate_quality_fallback.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.models.chat import SourceRef
|
| 2 |
+
from app.pipeline.nodes.generate import _build_low_trust_fallback
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_low_trust_fallback_for_version_parity_queries() -> None:
|
| 6 |
+
sources = [
|
| 7 |
+
SourceRef(
|
| 8 |
+
title="Sorting Demo",
|
| 9 |
+
url="https://github.com/1337Xcode/sortingdemo",
|
| 10 |
+
section="Overview",
|
| 11 |
+
source_type="project",
|
| 12 |
+
)
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
answer = _build_low_trust_fallback(
|
| 16 |
+
"Is the source code up-to-date with the online demo version?",
|
| 17 |
+
sources,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
assert "cannot be verified" in answer
|
| 21 |
+
assert "[1]" in answer
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_low_trust_fallback_general_query_is_concise() -> None:
|
| 25 |
+
sources = [
|
| 26 |
+
SourceRef(
|
| 27 |
+
title="Sorting Demo",
|
| 28 |
+
url="https://github.com/1337Xcode/sortingdemo",
|
| 29 |
+
section="Overview",
|
| 30 |
+
source_type="project",
|
| 31 |
+
)
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
answer = _build_low_trust_fallback(
|
| 35 |
+
"What technology is used to build Sorting Demo?",
|
| 36 |
+
sources,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
assert "Sorting Demo" in answer
|
| 40 |
+
assert "[1]" in answer
|
tests/test_parser_config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
ROOT = Path(__file__).resolve().parents[2]
|
| 6 |
+
if str(ROOT) not in sys.path:
|
| 7 |
+
sys.path.insert(0, str(ROOT))
|
| 8 |
+
|
| 9 |
+
from ingestion.parser_config import load_parser_config
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_parser_config_defaults_all_enabled(monkeypatch) -> None:
|
| 13 |
+
for key in (
|
| 14 |
+
"INGEST_ENABLE_BLOG_MDX",
|
| 15 |
+
"INGEST_ENABLE_PROJECT_MDX",
|
| 16 |
+
"INGEST_ENABLE_PDF",
|
| 17 |
+
"INGEST_ENABLE_GITHUB_README",
|
| 18 |
+
):
|
| 19 |
+
monkeypatch.delenv(key, raising=False)
|
| 20 |
+
|
| 21 |
+
cfg = load_parser_config()
|
| 22 |
+
|
| 23 |
+
assert cfg.enable_blog_mdx is True
|
| 24 |
+
assert cfg.enable_project_mdx is True
|
| 25 |
+
assert cfg.enable_pdf is True
|
| 26 |
+
assert cfg.enable_github_readme is True
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_parser_config_can_disable_selective_parsers(monkeypatch) -> None:
|
| 30 |
+
monkeypatch.setenv("INGEST_ENABLE_BLOG_MDX", "false")
|
| 31 |
+
monkeypatch.setenv("INGEST_ENABLE_PROJECT_MDX", "true")
|
| 32 |
+
monkeypatch.setenv("INGEST_ENABLE_PDF", "0")
|
| 33 |
+
monkeypatch.setenv("INGEST_ENABLE_GITHUB_README", "yes")
|
| 34 |
+
|
| 35 |
+
cfg = load_parser_config()
|
| 36 |
+
|
| 37 |
+
assert cfg.enable_blog_mdx is False
|
| 38 |
+
assert cfg.enable_project_mdx is True
|
| 39 |
+
assert cfg.enable_pdf is False
|
| 40 |
+
assert cfg.enable_github_readme is True
|
tests/test_parser_sanitization.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
ROOT = Path(__file__).resolve().parents[2]
|
| 5 |
+
if str(ROOT) not in sys.path:
|
| 6 |
+
sys.path.insert(0, str(ROOT))
|
| 7 |
+
|
| 8 |
+
from ingestion.parsers.readme_parser import parse_readme_bytes
|
| 9 |
+
from ingestion.parsers.text_sanitizer import strip_html_tags
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_strip_html_tags_removes_img_and_comments() -> None:
|
| 13 |
+
text = "Hello <!-- comment --> <img src='x'> world <b>bold</b>"
|
| 14 |
+
cleaned = strip_html_tags(text)
|
| 15 |
+
assert "<img" not in cleaned
|
| 16 |
+
assert "<!--" not in cleaned
|
| 17 |
+
assert "<b>" not in cleaned
|
| 18 |
+
assert "Hello" in cleaned and "world" in cleaned
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_parse_readme_bytes_removes_raw_html() -> None:
|
| 22 |
+
readme = b"# Repo\n\n<img src='banner.png'/>\n\nSome content"
|
| 23 |
+
parsed = parse_readme_bytes(readme, repo_name="1337Xcode/demo")
|
| 24 |
+
assert "<img" not in parsed["clean_content"]
|
| 25 |
+
assert "Some content" in parsed["clean_content"]
|
| 26 |
+
|