Spaces:

1337XCode
/

personabot-api

Running

App Files Files Community

GitHub Actions committed on 29 days ago

Commit

9563e4a

1 Parent(s): 1d47e3c

Deploy 1ba9ba6

Browse files

Files changed (8) hide show

app/api/chat.py +42 -6
app/core/quality.py +7 -0
app/pipeline/nodes/generate.py +70 -2
tests/test_chat_stream_reliability.py +54 -0
tests/test_generate_focus_selection.py +37 -0
tests/test_generate_quality_fallback.py +40 -0
tests/test_parser_config.py +40 -0
tests/test_parser_sanitization.py +26 -0

app/api/chat.py CHANGED Viewed

@@ -12,6 +12,14 @@ from app.security.jwt_auth import verify_jwt
 router = APIRouter()
 # Phrases a visitor uses when telling the bot it gave a wrong answer.
 # Matched on the lowercased raw message before any LLM call — O(1), zero cost.
 _CRITICISM_SIGNALS: frozenset[str] = frozenset({
@@ -72,7 +80,6 @@ async def _generate_follow_ups(
     for s in sources[:4]:
         title = s.title if hasattr(s, "title") else s.get("title", "")
         src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
-        url = s.url if hasattr(s, "url") else s.get("url", "")
         if title:
             source_info.append(f"{title} ({src_type})" if src_type else title)
@@ -196,7 +203,7 @@ async def chat_endpoint(
     # will use it if present; Guard runs first so the latency is masked).
     if decontext_task is not None:
         try:
-            result = await asyncio.wait_for(decontext_task, timeout=3.0)
             if result and result.strip().lower() != request_data.message.strip().lower():
                 decontextualized_query = result.strip()
         except Exception:
@@ -206,7 +213,7 @@ async def chat_endpoint(
     expansion_result: dict | None = None
     if expansion_task is not None:
         try:
-            expansion_result = await asyncio.wait_for(expansion_task, timeout=0.8)
         except Exception:
             pass  # Expansion is best-effort; retriever falls back to raw query.
@@ -260,14 +267,39 @@ async def chat_endpoint(
         interaction_id = None
         try:
             # stream_mode=["custom", "updates"] yields (mode, data) tuples:
             #   mode="custom"  → data is whatever writer(payload) was called with
             #   mode="updates" → data is {node_name: state_updates_dict}
-            async for mode, data in pipeline.astream(
                 initial_state,
                 stream_mode=["custom", "updates"],
-            ):
                 if await request.is_disconnected():
                     break
                 if mode == "custom":
@@ -342,5 +374,9 @@ async def chat_endpoint(
     return StreamingResponse(
         sse_generator(),
         media_type="text/event-stream",
-        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
     )

 router = APIRouter()
+# Keep-alive interval for SSE when upstream nodes are still working.
+# Prevents edge/proxy idle timeouts on long retrieval/generation turns.
+_SSE_HEARTBEAT_SECONDS: float = 10.0
+# Query pre-processing budgets must stay low to avoid delaying first byte.
+_DECONTEXT_TIMEOUT_SECONDS: float = 0.35
+_EXPANSION_TIMEOUT_SECONDS: float = 0.25
 # Phrases a visitor uses when telling the bot it gave a wrong answer.
 # Matched on the lowercased raw message before any LLM call — O(1), zero cost.
 _CRITICISM_SIGNALS: frozenset[str] = frozenset({
     for s in sources[:4]:
         title = s.title if hasattr(s, "title") else s.get("title", "")
         src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
         if title:
             source_info.append(f"{title} ({src_type})" if src_type else title)
     # will use it if present; Guard runs first so the latency is masked).
     if decontext_task is not None:
         try:
+            result = await asyncio.wait_for(decontext_task, timeout=_DECONTEXT_TIMEOUT_SECONDS)
             if result and result.strip().lower() != request_data.message.strip().lower():
                 decontextualized_query = result.strip()
         except Exception:
     expansion_result: dict | None = None
     if expansion_task is not None:
         try:
+            expansion_result = await asyncio.wait_for(expansion_task, timeout=_EXPANSION_TIMEOUT_SECONDS)
         except Exception:
             pass  # Expansion is best-effort; retriever falls back to raw query.
         interaction_id = None
         try:
+            # Emit an early event so clients/proxies receive first bytes quickly.
+            yield f"event: status\ndata: {json.dumps({'label': 'Starting response...'})}\n\n"
             # stream_mode=["custom", "updates"] yields (mode, data) tuples:
             #   mode="custom"  → data is whatever writer(payload) was called with
             #   mode="updates" → data is {node_name: state_updates_dict}
+            stream_iter = pipeline.astream(
                 initial_state,
                 stream_mode=["custom", "updates"],
+            ).__aiter__()
+            next_item_task: asyncio.Task | None = asyncio.create_task(stream_iter.__anext__())
+            while True:
+                try:
+                    mode, data = await asyncio.wait_for(
+                        asyncio.shield(next_item_task),
+                        timeout=_SSE_HEARTBEAT_SECONDS,
+                    )
+                except asyncio.TimeoutError:
+                    if await request.is_disconnected():
+                        if not next_item_task.done():
+                            next_item_task.cancel()
+                        break
+                    yield f"event: ping\ndata: {json.dumps({'ts': int(time.time())})}\n\n"
+                    continue
+                except StopAsyncIteration:
+                    break
+                next_item_task = asyncio.create_task(stream_iter.__anext__())
                 if await request.is_disconnected():
+                    if not next_item_task.done():
+                        next_item_task.cancel()
                     break
                 if mode == "custom":
     return StreamingResponse(
         sse_generator(),
         media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+            "Connection": "keep-alive",
+        },
     )

app/core/quality.py CHANGED Viewed

@@ -29,8 +29,13 @@ _HEDGE_PHRASES: tuple[str, ...] = (
     "does not provide",
     "does not offer",
     "no detailed information",
 )
 def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
     """
@@ -46,6 +51,8 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
     lowered = answer.lower()
     if any(phrase in lowered for phrase in _HEDGE_PHRASES):
         return True
     if chunks and not re.search(r"\[\d+\]", answer):
         return True
     if complexity == "complex" and len(answer.split()) < 30:

     "does not provide",
     "does not offer",
     "no detailed information",
+    "not explicitly state",
+    "not explicitly stated",
+    "cannot be verified",
 )
+_RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
 def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
     """
     lowered = answer.lower()
     if any(phrase in lowered for phrase in _HEDGE_PHRASES):
         return True
+    if _RAW_TAG_RE.search(answer):
+        return True
     if chunks and not re.search(r"\[\d+\]", answer):
         return True
     if complexity == "complex" and len(answer.split()) < 30:

app/pipeline/nodes/generate.py CHANGED Viewed

@@ -17,6 +17,9 @@ logger = logging.getLogger(__name__)
 _THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
 _THINK_OPEN_RE = re.compile(r"<think>")
 _THINK_CLOSE_RE = re.compile(r"</think>")
 # Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
 # (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
@@ -73,6 +76,10 @@ ANSWERING RULES — follow all of them every time:
    those facts — a short confident answer beats a padded hallucinated one.
 7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
 8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
 RELEVANCE CHECK — do this BEFORE writing:
 - Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
@@ -234,13 +241,67 @@ def _normalise_answer_text(answer: str, max_citation_index: int) -> str:
         idx = int(match.group(1))
         return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
-    cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, answer)
     cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
     cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
     cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
     return cleaned.strip()
 def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]:  # noqa: ANN001
     # Number of token chunks to buffer before deciding there is no CoT block.
     # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
@@ -310,7 +371,8 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
         # TextOps become [1] and [2] — the LLM cites both in the same sentence,
         # which looks like self-citing hallucination even though it is technically
         # correct.  _merge_by_source preserves all text; nothing is discarded.
-        merged_chunks = _merge_by_source(reranked_chunks)
         context_parts: list[str] = []
         source_refs: list[SourceRef] = []
@@ -457,6 +519,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
         full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
         # Only surface sources the LLM actually cited, deduplicated by URL so
         # multiple chunks from the same document show as one source card.
         cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}

 _THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
 _THINK_OPEN_RE = re.compile(r"<think>")
 _THINK_CLOSE_RE = re.compile(r"</think>")
+_GEN_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
+_VERSION_PARITY_RE = re.compile(r"\b(up[- ]?to[- ]?date|latest|current|in sync|same version|version)\b", re.IGNORECASE)
+_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
 # Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
 # (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
    those facts — a short confident answer beats a padded hallucinated one.
 7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
 8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
+9. If asked about freshness/version parity (e.g., "up-to-date", "same as demo"), and passages
+    do not explicitly confirm it, answer in at most 2 sentences: state what is known from passages,
+    then explicitly say it cannot be verified from indexed sources.
+10. Do not list unrelated projects or sources unless the user asked for a list/compare.
 RELEVANCE CHECK — do this BEFORE writing:
 - Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
         idx = int(match.group(1))
         return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
+    cleaned = _GEN_HTML_TAG_RE.sub("", answer)
+    cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, cleaned)
     cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
     cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
     cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
     return cleaned.strip()
+def _build_low_trust_fallback(query: str, source_refs: list[SourceRef]) -> str:
+    """Deterministic concise fallback when model output still fails trust checks."""
+    if not source_refs:
+        return _NOT_FOUND_ANSWER
+    first = source_refs[0]
+    title = first.title or "the retrieved source"
+    if _VERSION_PARITY_RE.search(query):
+        return (
+            f"The retrieved sources confirm links/details for {title} [1], but they do not explicitly "
+            "confirm whether the GitHub code and live demo are currently in sync, so version parity "
+            "cannot be verified from the indexed content alone [1]."
+        )
+    return (
+        f"Based on the retrieved evidence, the answer is grounded in {title} [1]. "
+        "If you want deeper detail, ask for a specific section, implementation part, or comparison."
+    )
+def _select_chunks_for_prompt(query: str, reranked_chunks: list[dict]) -> list[dict]:
+    """
+    Prefer chunks whose source title is explicitly referenced in the query.
+    This prevents focused questions (e.g. one project) from receiving multi-project
+    blended context that can trigger verbose, low-quality comparison answers.
+    """
+    if not reranked_chunks:
+        return reranked_chunks
+    query_lower = query.lower()
+    focused: list[dict] = []
+    for chunk in reranked_chunks:
+        title = str(chunk["metadata"].get("source_title", "")).strip()
+        if not title:
+            continue
+        title_lower = title.lower()
+        if len(title_lower) >= 4 and title_lower in query_lower:
+            focused.append(chunk)
+            continue
+        title_tokens = [t for t in _WORD_RE.findall(title_lower) if len(t) >= 4]
+        if title_tokens and sum(1 for tok in title_tokens if tok in query_lower) >= min(2, len(title_tokens)):
+            focused.append(chunk)
+    if focused:
+        return focused[:6]
+    return reranked_chunks[:8]
 def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]:  # noqa: ANN001
     # Number of token chunks to buffer before deciding there is no CoT block.
     # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
         # TextOps become [1] and [2] — the LLM cites both in the same sentence,
         # which looks like self-citing hallucination even though it is technically
         # correct.  _merge_by_source preserves all text; nothing is discarded.
+        selected_chunks = _select_chunks_for_prompt(query, reranked_chunks)
+        merged_chunks = _merge_by_source(selected_chunks)
         context_parts: list[str] = []
         source_refs: list[SourceRef] = []
         full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
+        # Final guardrail: if answer still looks low-trust after reformat + cleanup,
+        # return a concise deterministic fallback anchored to retrieved sources.
+        if is_low_trust(full_answer, reranked_chunks, complexity):
+            logger.debug("Final low-trust guard triggered; using deterministic fallback.")
+            full_answer = _build_low_trust_fallback(query, source_refs)
         # Only surface sources the LLM actually cited, deduplicated by URL so
         # multiple chunks from the same document show as one source card.
         cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}

tests/test_chat_stream_reliability.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import asyncio
+import json
+import time
+from unittest.mock import MagicMock, patch
+from fastapi.testclient import TestClient
+from jose import jwt
+def _make_token() -> str:
+    payload = {"sub": "test-user", "exp": int(time.time()) + 3600}
+    return jwt.encode(payload, "test-secret-32-chars-long-0000000", algorithm="HS256")
+def _chat(client: TestClient, message: str) -> str:
+    token = _make_token()
+    response = client.post(
+        "/chat",
+        json={"message": message, "session_id": "a1b2c3d4-e5f6-4789-8abc-def012345678"},
+        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
+    )
+    assert response.status_code == 200
+    return response.text
+def test_stream_emits_early_status_and_heartbeat(monkeypatch):
+    """The SSE body carries the early status event, at least one ping, and the answer text."""
+    # Reduce heartbeat interval so the test can verify keepalive quickly.
+    monkeypatch.setattr("app.api.chat._SSE_HEARTBEAT_SECONDS", 0.05)
+    mock_pipeline = MagicMock()
+    async def delayed_astream(state, stream_mode=None):
+        # Stay silent for longer than the patched 0.05 s heartbeat before the
+        # first item, so the heartbeat timeout can fire first.
+        await asyncio.sleep(0.12)
+        yield ("custom", {"type": "status", "label": "Thinking..."})
+        yield ("custom", {"type": "token", "text": "Answer text."})
+        yield ("updates", {"generate": {"answer": "Answer text.", "sources": []}})
+    mock_pipeline.astream = delayed_astream
+    # Stub out pipeline construction and the Qdrant/embedder/reranker classes
+    # while the app is built and exercised.
+    with patch("app.main.build_pipeline", return_value=mock_pipeline), \
+         patch("app.main.QdrantClient"), \
+         patch("app.services.embedder.Embedder"), \
+         patch("app.services.reranker.Reranker"):
+        # Imported inside the patch context so the patches are active when the app is created.
+        from app.main import create_app
+        app = create_app()
+        app.state.pipeline = mock_pipeline
+        with TestClient(app, raise_server_exceptions=True) as client:
+            body = _chat(client, "Tell me about TextOps")
+    assert "event: status" in body
+    assert "Starting response..." in body
+    assert "event: ping" in body
+    assert "Answer text." in body

tests/test_generate_focus_selection.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from app.pipeline.nodes.generate import _select_chunks_for_prompt
+def _chunk(title: str, section: str = "Overview") -> dict:
+    return {
+        "text": f"Info about {title}",
+        "metadata": {
+            "source_title": title,
+            "section": section,
+            "source_url": "",
+            "source_type": "project",
+        },
+    }
+def test_select_chunks_prefers_explicitly_mentioned_source_title() -> None:
+    chunks = [
+        _chunk("Sorting Demo"),
+        _chunk("EchoEcho"),
+        _chunk("Donut.asm"),
+    ]
+    selected = _select_chunks_for_prompt(
+        "Is the source code on GitHub up-to-date with the Sorting Demo live demo?",
+        chunks,
+    )
+    assert selected
+    assert all(c["metadata"]["source_title"] == "Sorting Demo" for c in selected)
+def test_select_chunks_falls_back_to_top_ranked_when_no_title_match() -> None:
+    chunks = [_chunk("A"), _chunk("B"), _chunk("C")]
+    selected = _select_chunks_for_prompt("What technologies are used?", chunks)
+    assert selected == chunks

tests/test_generate_quality_fallback.py ADDED Viewed

	@@ -0,0 +1,40 @@

+from app.models.chat import SourceRef
+from app.pipeline.nodes.generate import _build_low_trust_fallback
+def test_low_trust_fallback_for_version_parity_queries() -> None:
+    sources = [
+        SourceRef(
+            title="Sorting Demo",
+            url="https://github.com/1337Xcode/sortingdemo",
+            section="Overview",
+            source_type="project",
+        )
+    ]
+    answer = _build_low_trust_fallback(
+        "Is the source code up-to-date with the online demo version?",
+        sources,
+    )
+    assert "cannot be verified" in answer
+    assert "[1]" in answer
+def test_low_trust_fallback_general_query_is_concise() -> None:
+    sources = [
+        SourceRef(
+            title="Sorting Demo",
+            url="https://github.com/1337Xcode/sortingdemo",
+            section="Overview",
+            source_type="project",
+        )
+    ]
+    answer = _build_low_trust_fallback(
+        "What technology is used to build Sorting Demo?",
+        sources,
+    )
+    assert "Sorting Demo" in answer
+    assert "[1]" in answer

tests/test_parser_config.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[2]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+from ingestion.parser_config import load_parser_config
+def test_parser_config_defaults_all_enabled(monkeypatch) -> None:
+    for key in (
+        "INGEST_ENABLE_BLOG_MDX",
+        "INGEST_ENABLE_PROJECT_MDX",
+        "INGEST_ENABLE_PDF",
+        "INGEST_ENABLE_GITHUB_README",
+    ):
+        monkeypatch.delenv(key, raising=False)
+    cfg = load_parser_config()
+    assert cfg.enable_blog_mdx is True
+    assert cfg.enable_project_mdx is True
+    assert cfg.enable_pdf is True
+    assert cfg.enable_github_readme is True
+def test_parser_config_can_disable_selective_parsers(monkeypatch) -> None:
+    monkeypatch.setenv("INGEST_ENABLE_BLOG_MDX", "false")
+    monkeypatch.setenv("INGEST_ENABLE_PROJECT_MDX", "true")
+    monkeypatch.setenv("INGEST_ENABLE_PDF", "0")
+    monkeypatch.setenv("INGEST_ENABLE_GITHUB_README", "yes")
+    cfg = load_parser_config()
+    assert cfg.enable_blog_mdx is False
+    assert cfg.enable_project_mdx is True
+    assert cfg.enable_pdf is False
+    assert cfg.enable_github_readme is True

tests/test_parser_sanitization.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[2]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+from ingestion.parsers.readme_parser import parse_readme_bytes
+from ingestion.parsers.text_sanitizer import strip_html_tags
+def test_strip_html_tags_removes_img_and_comments() -> None:
+    text = "Hello <!-- comment --> <img src='x'> world <b>bold</b>"
+    cleaned = strip_html_tags(text)
+    assert "<img" not in cleaned
+    assert "<!--" not in cleaned
+    assert "<b>" not in cleaned
+    assert "Hello" in cleaned and "world" in cleaned
+def test_parse_readme_bytes_removes_raw_html() -> None:
+    readme = b"# Repo\n\n<img src='banner.png'/>\n\nSome content"
+    parsed = parse_readme_bytes(readme, repo_name="1337Xcode/demo")
+    assert "<img" not in parsed["clean_content"]
+    assert "Some content" in parsed["clean_content"]