GitHub Actions committed on
Commit
0da0699
·
1 Parent(s): 4ef165a

Deploy 27439fc

Browse files
app/api/chat.py CHANGED
@@ -194,6 +194,8 @@ async def chat_endpoint(
194
  "critic_completeness": None,
195
  "critic_specificity": None,
196
  "critic_quality": None,
 
 
197
  }
198
 
199
  async def sse_generator():
 
194
  "critic_completeness": None,
195
  "critic_specificity": None,
196
  "critic_quality": None,
197
+ # Fix 1: enumeration classifier — populated by enumerate_query node
198
+ "is_enumeration_query": False,
199
  }
200
 
201
  async def sse_generator():
app/core/portfolio_context.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/core/portfolio_context.py
3
+
4
+ Known portfolio entities extracted from the TOON context file.
5
+
6
+ Two purposes:
7
+ 1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking
8
+ about something genuinely in the portfolio. When the first CRAG retry
9
+ also fails, a second retry is allowed for queries that mention known
10
+ entities. This prevents the not-found response from firing on queries
11
+ that should have findings (e.g. "how does textops work?").
12
+
13
+ 2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes
14
+ the TOON entity list to Gemini so it can produce a specific redirect like
15
+ "Try asking about his TextOps Kubernetes setup" rather than the generic
16
+ "ask about his projects".
17
+
18
+ Entity list is manually maintained from the TOON context file and must be
19
+ updated whenever refresh_gemini_context.py adds new content.
20
+ Deliberate duplication: the TOON file is runtime state (may be absent in tests);
21
+ this module is compile-time — no file I/O, no latency, no failure mode.
22
+ """
23
+ from __future__ import annotations
24
+
25
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
# ---------------------------------------------------------------------------
# NOTE(review): hyphenated forms ("echo-echo", "save-the-planet") can never
# match as single tokens in is_portfolio_relevant below, because the matcher
# splits on non-alphanumerics; they are matched via their spaced n-gram twins.
# The longest entry is three tokens ("student management system"), so the
# matcher must check n-grams up to length 3.
KNOWN_PROJECTS: frozenset[str] = frozenset({
    "textops", "text ops",
    "echo-echo", "echo echo",
    "localhost",
    "donut-asm", "donut asm", "donut.c", "donut",
    "save-the-planet", "save the planet",
    "sorting-demo", "sorting demo",
    "student-management-system", "student management system",
    "sysphus",
    "personabot", "persona bot",
})

# ---------------------------------------------------------------------------
# Known technologies (canonical forms + common abbreviations)
# ---------------------------------------------------------------------------
KNOWN_TECHNOLOGIES: frozenset[str] = frozenset({
    # Languages
    "python", "go", "golang", "java", "javascript", "typescript",
    "assembly", "x86", "sql", "html", "css",
    # Frameworks / libraries
    "fastapi", "react", "node.js", "nodejs", "express", "ejs",
    "langgraph", "langchain", "pydantic",
    # Infra / cloud
    "docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab",
    "github actions", "nginx",
    # ML / AI
    "yolo", "yolov8", "ncnn", "onnx",
    "rag", "llm", "llms", "groq", "gemini", "qdrant",
    "sentence-transformers", "bge", "cross-encoder", "bm25",
    # Networking / P2P
    "webrtc", "kademlia", "tor", "dht", "p2p",
    # Database
    "sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm",
    # Testing
    "junit", "pytest",
    "jwt", "owasp",
    # Monitoring
    "prometheus", "mlflow", "dagshub",
    # Misc
    "microservices", "serverless", "e2ee",
})

# ---------------------------------------------------------------------------
# Known companies / educational institutions
# ---------------------------------------------------------------------------
KNOWN_ORGS: frozenset[str] = frozenset({
    # Employment (update from TOON / resume as new roles are indexed)
    "vk live", "vklive",
    # Education
    "university",
    # Platforms / services
    "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
})

# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS

# Compact context block passed to Gemini when generating a specific not-found
# suggestion. One sentence per major entity class — tight token budget.
# Keep in sync with the frozensets above and the TOON context file.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)


def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, then test
    unigrams, bigrams, AND trigrams against the known-entity set. The
    trigram pass is required because KNOWN_PROJECTS contains three-word
    entries ("save the planet", "student management system") that unigram
    and bigram checks alone can never match. Pure string/set ops —
    microseconds per call on a short query, zero latency impact.
    """
    import re
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    # Unigram check — "textops", "kubernetes", ...
    for token in tokens:
        if token in ALL_PORTFOLIO_NOUNS:
            return True
    # Bigram check — catches "vk live", "text ops", "echo echo".
    for a, b in zip(tokens, tokens[1:]):
        if f"{a} {b}" in ALL_PORTFOLIO_NOUNS:
            return True
    # Trigram check — catches "save the planet" and "student management
    # system" (also their hyphenated spellings, which tokenize identically).
    for a, b, c in zip(tokens, tokens[1:], tokens[2:]):
        if f"{a} {b} {c}" in ALL_PORTFOLIO_NOUNS:
            return True
    return False
app/models/pipeline.py CHANGED
@@ -82,3 +82,9 @@ class PipelineState(TypedDict):
82
  critic_completeness: Optional[int] # answer uses all relevant available chunks
83
  critic_specificity: Optional[int] # answer contains specific names/numbers
84
  critic_quality: Optional[str] # "high" | "medium" | "low"
 
 
 
 
 
 
 
82
  critic_completeness: Optional[int] # answer uses all relevant available chunks
83
  critic_specificity: Optional[int] # answer contains specific names/numbers
84
  critic_quality: Optional[str] # "high" | "medium" | "low"
85
+ # Fix 1: Enumeration query classifier.
86
+ # True when the query has enumeration intent ("list all projects", "how many blogs").
87
+ # The enumerate_query node skips semantic retrieval and does a Qdrant payload-filter
88
+ # scroll instead, returning a complete deduplicated title list.
89
+ # Logged to SQLite so enumeration turns can be monitored separately from RAG turns.
90
+ is_enumeration_query: bool
app/pipeline/graph.py CHANGED
@@ -4,11 +4,13 @@ from langgraph.graph.state import CompiledStateGraph
4
  from app.models.pipeline import PipelineState
5
  from app.pipeline.nodes.guard import make_guard_node
6
  from app.pipeline.nodes.cache import make_cache_node
 
7
  from app.pipeline.nodes.gemini_fast import make_gemini_fast_node
8
  from app.pipeline.nodes.retrieve import make_retrieve_node
9
  from app.pipeline.nodes.rewrite_query import make_rewrite_query_node, _has_meaningful_token
10
  from app.pipeline.nodes.generate import make_generate_node
11
  from app.pipeline.nodes.log_eval import make_log_eval_node
 
12
 
13
  # Relevance gate threshold — matches retrieve.py constant.
14
  _MIN_TOP_SCORE: float = -3.5
@@ -28,6 +30,20 @@ def route_guard(state: PipelineState) -> str:
28
  return "block"
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def route_cache(state: PipelineState) -> str:
32
  if state.get("cached", False):
33
  return "hit"
@@ -48,44 +64,70 @@ def route_gemini(state: PipelineState) -> str:
48
  def route_retrieve_result(state: PipelineState) -> str:
49
  """
50
  CRAG routing: trigger a query rewrite when retrieval was weak or empty.
51
- Exactly one retry is permitted; retrieval_attempts tracks this.
52
 
53
- Rewrite conditions (first attempt only, meaningful query tokens required):
54
- 1. reranked_chunks is empty (nothing above the -3.5 threshold).
55
- 2. reranked_chunks is non-empty but the top cross-encoder score is below
56
- _CRAG_LOW_CONFIDENCE_SCORE (-1.5), indicating borderline retrieval where
57
- a different query phrasing would likely produce much better matches.
 
 
 
 
 
 
 
 
 
58
  """
59
  attempts = state.get("retrieval_attempts", 1)
60
  reranked = state.get("reranked_chunks", [])
61
- if attempts == 1 and _has_meaningful_token(state.get("query", "")):
 
 
 
62
  if not reranked:
63
  return "rewrite"
64
  top_score = state.get("top_rerank_score")
65
  if top_score is not None and top_score < _CRAG_LOW_CONFIDENCE_SCORE:
66
  return "rewrite"
 
 
 
 
 
 
 
 
 
67
  return "generate"
68
 
69
 
70
  def build_pipeline(services: dict) -> CompiledStateGraph:
71
  graph = StateGraph(PipelineState)
72
 
73
- graph.add_node("guard", make_guard_node(services["classifier"]))
74
- graph.add_node("cache", make_cache_node(services["cache"], services["embedder"]))
75
- graph.add_node("gemini_fast", make_gemini_fast_node(services["gemini"]))
76
- graph.add_node("retrieve", make_retrieve_node(
77
- services["vector_store"],
78
- services["embedder"],
79
- services["reranker"]))
80
- # CRAG: one query rewrite on failed retrieval — then retrieve runs a second time.
81
- graph.add_node("rewrite_query", make_rewrite_query_node(services["gemini"]))
82
- graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
83
- graph.add_node("log_eval", make_log_eval_node(services["db_path"], services.get("github_log")))
 
84
 
85
  graph.set_entry_point("guard")
86
 
87
  graph.add_conditional_edges("guard", route_guard,
88
- {"pass": "cache", "block": "log_eval"})
 
 
 
 
 
89
 
90
  graph.add_conditional_edges("cache", route_cache,
91
  {"hit": "log_eval", "miss": "gemini_fast"})
@@ -93,11 +135,12 @@ def build_pipeline(services: dict) -> CompiledStateGraph:
93
  graph.add_conditional_edges("gemini_fast", route_gemini,
94
  {"answered": "log_eval", "research": "retrieve"})
95
 
96
- # After retrieve: either run CRAG rewrite (one retry) or proceed to generate.
 
97
  graph.add_conditional_edges("retrieve", route_retrieve_result,
98
  {"rewrite": "rewrite_query", "generate": "generate"})
99
 
100
- # After rewrite: go straight back to retrieve for the second attempt.
101
  # The cycle terminates because route_retrieve_result checks retrieval_attempts.
102
  graph.add_edge("rewrite_query", "retrieve")
103
 
 
4
  from app.models.pipeline import PipelineState
5
  from app.pipeline.nodes.guard import make_guard_node
6
  from app.pipeline.nodes.cache import make_cache_node
7
+ from app.pipeline.nodes.enumerate_query import make_enumerate_query_node
8
  from app.pipeline.nodes.gemini_fast import make_gemini_fast_node
9
  from app.pipeline.nodes.retrieve import make_retrieve_node
10
  from app.pipeline.nodes.rewrite_query import make_rewrite_query_node, _has_meaningful_token
11
  from app.pipeline.nodes.generate import make_generate_node
12
  from app.pipeline.nodes.log_eval import make_log_eval_node
13
+ from app.core.portfolio_context import is_portfolio_relevant
14
 
15
  # Relevance gate threshold — matches retrieve.py constant.
16
  _MIN_TOP_SCORE: float = -3.5
 
30
  return "block"
31
 
32
 
33
def route_enumerate(state: PipelineState) -> str:
    """
    Fix 1: decide, right after the enumerate_query node, whether the normal
    retrieval pipeline can be skipped entirely.

    Returns "skip_to_generate" when enumeration intent was detected — the
    enumerate node has already filled reranked_chunks with the complete
    Qdrant scroll result. Returns "continue" otherwise, so the query flows
    through cache → gemini_fast → retrieve as usual.
    """
    enumeration_detected = state.get("is_enumeration_query", False)
    return "skip_to_generate" if enumeration_detected else "continue"
45
+
46
+
47
  def route_cache(state: PipelineState) -> str:
48
  if state.get("cached", False):
49
  return "hit"
 
64
def route_retrieve_result(state: PipelineState) -> str:
    """
    CRAG routing: decide whether weak/empty retrieval warrants a query
    rewrite, or whether generation should proceed with what we have.

    Fix 2 Rule 1: queries that mention a known portfolio entity are granted
    a SECOND rewrite after the first retry also comes back empty, so the
    not-found response doesn't fire on questions the corpus genuinely
    covers (typos in a project name, synonym mismatches, etc.).

    Attempt tracking (via retrieval_attempts):
        first retrieve → 1, first rewrite → 2, second retrieve → 3,
        second rewrite → 4 (portfolio queries only), third retrieve → 5.
    Any attempt ≥ 5 (≥ 3 for non-portfolio queries) routes to generate.
    retrieval_attempts grows monotonically, so the cycle terminates.
    """
    attempts = state.get("retrieval_attempts", 1)
    chunks = state.get("reranked_chunks", [])
    query_text = state.get("query", "")

    # First CRAG pass — any query with meaningful tokens qualifies when
    # retrieval came back empty or with only a low-confidence top score.
    if attempts == 1 and _has_meaningful_token(query_text):
        score = state.get("top_rerank_score")
        low_confidence = score is not None and score < _CRAG_LOW_CONFIDENCE_SCORE
        if not chunks or low_confidence:
            return "rewrite"

    # Second CRAG pass (Fix 2 Rule 1) — attempts == 3 means the first
    # rewrite already fired and the second retrieve still found nothing.
    # Grant one more vocabulary-shifted rewrite, but only for queries that
    # mention a known portfolio entity.
    if attempts == 3 and not chunks and is_portfolio_relevant(query_text):
        return "rewrite"

    return "generate"
104
 
105
 
106
  def build_pipeline(services: dict) -> CompiledStateGraph:
107
  graph = StateGraph(PipelineState)
108
 
109
+ graph.add_node("guard", make_guard_node(services["classifier"]))
110
+ graph.add_node("enumerate_query", make_enumerate_query_node(services["vector_store"]))
111
+ graph.add_node("cache", make_cache_node(services["cache"], services["embedder"]))
112
+ graph.add_node("gemini_fast", make_gemini_fast_node(services["gemini"]))
113
+ graph.add_node("retrieve", make_retrieve_node(
114
+ services["vector_store"],
115
+ services["embedder"],
116
+ services["reranker"]))
117
+ # CRAG: query rewrite on failed retrieval — runs up to twice for portfolio queries.
118
+ graph.add_node("rewrite_query", make_rewrite_query_node(services["gemini"]))
119
+ graph.add_node("generate", make_generate_node(services["llm"], services["gemini"]))
120
+ graph.add_node("log_eval", make_log_eval_node(services["db_path"], services.get("github_log")))
121
 
122
  graph.set_entry_point("guard")
123
 
124
  graph.add_conditional_edges("guard", route_guard,
125
+ {"pass": "enumerate_query", "block": "log_eval"})
126
+
127
+ # Fix 1: enumerate_query either skips straight to generate (full list fetched)
128
+ # or falls through to the normal cache → gemini_fast → retrieve pipeline.
129
+ graph.add_conditional_edges("enumerate_query", route_enumerate,
130
+ {"skip_to_generate": "generate", "continue": "cache"})
131
 
132
  graph.add_conditional_edges("cache", route_cache,
133
  {"hit": "log_eval", "miss": "gemini_fast"})
 
135
  graph.add_conditional_edges("gemini_fast", route_gemini,
136
  {"answered": "log_eval", "research": "retrieve"})
137
 
138
+ # After retrieve: either run CRAG rewrite (up to twice for portfolio queries)
139
+ # or proceed to generate.
140
  graph.add_conditional_edges("retrieve", route_retrieve_result,
141
  {"rewrite": "rewrite_query", "generate": "generate"})
142
 
143
+ # After rewrite: go straight back to retrieve for the next attempt.
144
  # The cycle terminates because route_retrieve_result checks retrieval_attempts.
145
  graph.add_edge("rewrite_query", "retrieve")
146
 
app/pipeline/nodes/enumerate_query.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ backend/app/pipeline/nodes/enumerate_query.py
3
+
4
+ Fix 1 — Enumeration Query Classifier and Metadata Retrieval.
5
+
6
+ Inserted immediately after the Guard node (before Cache / Gemini fast-path).
7
+ When enumeration intent is detected, this node queries Qdrant using a
8
+ payload filter on metadata.source_type — no vector embedding, no reranker.
9
+ It then deduplicates by source_title, sorts alphabetically, and populates
10
+ reranked_chunks so the Generate node receives the complete, accurate list.
11
+
12
+ Why a database filter beats similarity search for enumeration:
13
+ Semantic retrieval cannot guarantee completeness — it finds the top-K
14
+ most similar chunks, not ALL matching chunks. "List all my projects"
15
+ with top_k=20 and 8 projects in the corpus would return the 8 most
16
+ similar to the query vector, but which 8 depends on the embedding.
17
+ A payload filter returns every matching point, regardless of embedding
18
+ position. Completeness is guaranteed; the cosine metric is irrelevant.
19
+
20
+ Cost: 0 embedding calls, 0 reranker calls, 1 Qdrant scroll.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+ import re
26
+ from typing import Callable
27
+
28
+ from langgraph.config import get_stream_writer
29
+
30
+ from app.models.pipeline import PipelineState, Chunk
31
+ from app.services.vector_store import VectorStore
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Enumeration intent patterns
37
+ # ---------------------------------------------------------------------------
38
+ # Each pattern is checked against the lowercased, whitespace-normalised query.
39
+ # Order matters: more specific patterns are checked first.
40
+ _ENUM_PREFIXES: tuple[str, ...] = (
41
+ "list all",
42
+ "list the",
43
+ "list every",
44
+ "list your",
45
+ "list his",
46
+ "list ",
47
+ "show all",
48
+ "show me all",
49
+ "show every",
50
+ "give me all",
51
+ "give me a list",
52
+ "what are all",
53
+ "what are your",
54
+ "what are his",
55
+ "how many",
56
+ "count ",
57
+ "count of",
58
+ "enumerate",
59
+ "name all",
60
+ "name every",
61
+ )
62
+
63
+ # Trailing pattern: "what [are|were|is] all the <noun>?"
64
+ _ENUM_TRAILING_RE = re.compile(
65
+ r"(?:what|which)\s+(?:are|were|is|were)\s+all\s+(?:the\s+)?",
66
+ re.IGNORECASE,
67
+ )
68
+
69
+
70
+ def _has_enumeration_intent(query: str) -> bool:
71
+ """
72
+ Return True when the lowercased query signals enumeration intent.
73
+ Pure string ops — no LLM, no embedding. Runs in < 5µs.
74
+ """
75
+ q = " ".join(query.lower().split()) # normalise whitespace
76
+ for prefix in _ENUM_PREFIXES:
77
+ if q.startswith(prefix) or f" {prefix}" in q:
78
+ return True
79
+ if _ENUM_TRAILING_RE.search(q):
80
+ return True
81
+ return False
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Entity-type extractor
86
+ # ---------------------------------------------------------------------------
87
+ # Maps query tokens → Qdrant source_type values.
88
+ # "all source types" is represented as an empty list (caller scrolls without filter).
89
+ _TYPE_MAP: dict[str, list[str]] = {
90
+ "project": ["project"],
91
+ "projects": ["project"],
92
+ "blog": ["blog"],
93
+ "blogs": ["blog"],
94
+ "post": ["blog"],
95
+ "posts": ["blog"],
96
+ "article": ["blog"],
97
+ "articles": ["blog"],
98
+ "writing": ["blog"],
99
+ "writings": ["blog"],
100
+ "experience": ["cv", "bio"],
101
+ "experiences": ["cv", "bio"],
102
+ "work": ["cv", "bio"],
103
+ "jobs": ["cv", "bio"],
104
+ "job": ["cv", "bio"],
105
+ "role": ["cv", "bio"],
106
+ "roles": ["cv", "bio"],
107
+ "company": ["cv", "bio"],
108
+ "companies": ["cv", "bio"],
109
+ "skills": ["cv", "project", "blog"],
110
+ "skill": ["cv", "project", "blog"],
111
+ "technologies": ["cv", "project", "blog"],
112
+ "technology": ["cv", "project", "blog"],
113
+ "tech": ["cv", "project", "blog"],
114
+ "tools": ["cv", "project", "blog"],
115
+ "readme": ["github"],
116
+ "repositories": ["github"],
117
+ "repos": ["github"],
118
+ }
119
+
120
+
121
+ def _extract_source_types(query: str) -> list[str]:
122
+ """
123
+ Map query vocabulary to Qdrant source_type values.
124
+ Returns a deduplicated list. An empty list means "all types".
125
+ """
126
+ tokens = re.findall(r"[a-z]+", query.lower())
127
+ found: list[str] = []
128
+ seen: set[str] = set()
129
+ for tok in tokens:
130
+ for st in _TYPE_MAP.get(tok, []):
131
+ if st not in seen:
132
+ seen.add(st)
133
+ found.append(st)
134
+ # If no specific type matched, return empty (= all types).
135
+ return found
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Source type display label (used in status event)
140
+ # ---------------------------------------------------------------------------
141
+ _TYPE_LABEL: dict[str, str] = {
142
+ "project": "projects",
143
+ "blog": "blog posts",
144
+ "cv": "CV/experience",
145
+ "bio": "background",
146
+ "github": "GitHub repos",
147
+ }
148
+
149
+
150
+ def _label_for_types(source_types: list[str]) -> str:
151
+ if not source_types:
152
+ return "all portfolio content"
153
+ return " and ".join(_TYPE_LABEL.get(st, st) for st in source_types[:2])
154
+
155
+
156
# ---------------------------------------------------------------------------
# Node factory
# ---------------------------------------------------------------------------

def make_enumerate_query_node(vector_store: VectorStore) -> Callable[[PipelineState], dict]:
    """
    Build the LangGraph enumeration node (Fix 1).

    The returned node:
    1. Classifies whether the query has enumeration intent.
    2. If yes: scrolls Qdrant by source_type, deduplicates by title,
       populates reranked_chunks, sets is_enumeration_query=True.
    3. If no: passes through with is_enumeration_query=False so the
       rest of the pipeline (cache → gemini_fast → retrieve) runs normally.

    Args:
        vector_store: service exposing scroll_by_source_type(source_types=...).
            NOTE(review): assumed to return chunks shaped like the retrieve
            node's output (mappings with "metadata" and "text" keys) — only
            the call site is visible here; confirm against VectorStore.

    Returns:
        A node callable mapping PipelineState → partial state update dict.

    No I/O unless enumeration intent is detected.
    """

    def enumerate_query_node(state: PipelineState) -> dict:
        # Custom-event writer for the SSE stream (status / reading events).
        writer = get_stream_writer()
        query = state["query"]

        # Fast pure-string classifier: the common (non-enumeration) case
        # returns immediately with no I/O at all.
        if not _has_enumeration_intent(query):
            return {"is_enumeration_query": False}

        # Enumeration intent confirmed.
        source_types = _extract_source_types(query)
        label = _label_for_types(source_types)
        writer({"type": "status", "label": f"Fetching complete list of {label}..."})

        # Scroll Qdrant — payload filter, no vector. An empty source_types
        # list means "no specific type detected", so scroll every known type.
        all_chunks = vector_store.scroll_by_source_type(
            source_types=source_types or ["project", "blog", "cv", "bio", "github"],
        )

        if not all_chunks:
            # Nothing in the corpus yet — let the normal pipeline handle it.
            logger.info("Enumeration scroll returned 0 results; falling back to RAG path.")
            return {"is_enumeration_query": False}

        # Deduplicate by source_title (many chunks per document; we want title-level list).
        # First chunk seen per title wins; chunks with a blank title are dropped.
        seen_titles: set[str] = set()
        unique_by_title: list[Chunk] = []
        for chunk in all_chunks:
            title = chunk["metadata"].get("source_title", "").strip()
            if title and title not in seen_titles:
                seen_titles.add(title)
                unique_by_title.append(chunk)

        # Sort alphabetically by title for stable output.
        unique_by_title.sort(key=lambda c: c["metadata"].get("source_title", "").lower())

        logger.info(
            "Enumeration: query=%r source_types=%r → %d unique titles",
            query, source_types, len(unique_by_title),
        )

        # Emit one "reading" event per unique source so the frontend's source card
        # row is populated (mirrors the retrieve node's contract).
        # Dedup key falls back to doc_id when a source has no URL.
        seen_urls: set[str] = set()
        for chunk in unique_by_title:
            meta = chunk["metadata"]
            url = meta.get("source_url") or ""
            dedup_key = url or meta.get("doc_id", "")
            if dedup_key and dedup_key not in seen_urls:
                seen_urls.add(dedup_key)
                writer({
                    "type": "reading",
                    "title": meta.get("source_title", ""),
                    "url": url or None,
                    "source_type": meta.get("source_type", ""),
                })

        writer({"type": "status", "label": f"Found {len(unique_by_title)} items — composing list..."})

        return {
            "is_enumeration_query": True,
            "reranked_chunks": unique_by_title,
            # Mark path early so log_eval tags enumeration turns separately.
            "path": "enumeration",
        }

    return enumerate_query_node
app/pipeline/nodes/generate.py CHANGED
@@ -8,6 +8,7 @@ from app.models.chat import SourceRef
8
  from app.models.pipeline import PipelineState
9
  from app.services.llm_client import LLMClient
10
  from app.core.quality import is_low_trust
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -84,16 +85,36 @@ CRITICAL SAFETY RULES — override everything above:
84
  # context here, so anything specific it says would be fabricated.
85
  _NOT_FOUND_SYSTEM = """\
86
  You are the assistant on Darshan Chheda's portfolio website.
87
- The knowledge base search returned no relevant results for this question.
88
-
89
- Respond in 1-2 natural sentences. Use fresh wording each time do not start with
90
- "I don't have information about". Acknowledge that specific information isn't indexed
91
- right now, then invite the visitor to ask about {topics}.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- CRITICAL: Do NOT name any specific project, technology, company, blog post, or skill.
94
- You have NO retrieved facts any specific name you produce is fabricated.
95
- No apologies, no padding, vary your phrasing.
96
- """.format(topics=_TOPIC_SUGGESTIONS)
 
 
 
 
 
97
 
98
 
99
  def _format_history(state: "PipelineState") -> str:
@@ -135,12 +156,71 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
135
  complexity = state.get("query_complexity", "simple")
136
  reranked_chunks = state.get("reranked_chunks", [])
137
 
138
- # ── Not-found path ─────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  if not reranked_chunks:
140
  writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  history_prefix = _format_history(state)
 
 
 
 
142
  stream = llm_client.complete_with_complexity(
143
- prompt=f"{history_prefix}Visitor question: {query}",
144
  system=_NOT_FOUND_SYSTEM,
145
  stream=True,
146
  complexity="simple",
 
8
  from app.models.pipeline import PipelineState
9
  from app.services.llm_client import LLMClient
10
  from app.core.quality import is_low_trust
11
+ from app.core.portfolio_context import SUGGESTION_HINT
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
85
  # context here, so anything specific it says would be fabricated.
86
  _NOT_FOUND_SYSTEM = """\
87
  You are the assistant on Darshan Chheda's portfolio website.
88
+ The knowledge base search returned no relevant results for this question even after a retry.
89
+
90
+ You will be given one specific suggestion to offer (generated from the real portfolio index).
91
+ Respond in 1-2 natural sentences:
92
+ 1. Acknowledge that specific information isn't indexed right now.
93
+ 2. End with the specific suggestion provided after SUGGESTION:.
94
+
95
+ Rules:
96
+ - Use fresh wording each time — do not start with "I don't have information about".
97
+ - Do NOT name any specific project, technology, company, or skill UNLESS it appears in the
98
+ SUGGESTION line provided to you. You have NO retrieved facts.
99
+ - No apologies, no padding, vary your phrasing.
100
+ """
101
+
102
+ # Enumeration path: Groq formats the pre-fetched, deduplicated title list.
103
+ # The generate node builds a numbered list in the prompt; Groq adds citations.
104
+ _ENUM_SYSTEM_PROMPT = """\
105
+ You are the assistant on Darshan Chheda's portfolio website.
106
+ You have been given a complete, database-fetched list of items matching the visitor's request.
107
+ Your job is to format this list as a clean numbered list and add one citation per item.
108
 
109
+ FORMATTING RULES:
110
+ 1. Output a numbered list. Each line: "N. [Title](URL) one-sentence description from the passage."
111
+ 2. Cite each item with [N] immediately after its title. Example: "1. TextOps [1] — ..."
112
+ 3. Only use the titles, URLs, and text provided in the passages. Do not invent items.
113
+ 4. If a URL is missing for an item, omit the link but keep the title.
114
+ 5. Do not add a preamble like "Here is a list of..." — start directly with "1.".
115
+ 6. After the list, add one sentence summarising the count: "That's N items in total."
116
+ 7. No apologies, no padding.
117
+ """
118
 
119
 
120
  def _format_history(state: "PipelineState") -> str:
 
156
  complexity = state.get("query_complexity", "simple")
157
  reranked_chunks = state.get("reranked_chunks", [])
158
 
159
+ # ── Enumeration path (Fix 1) ──────────────────────────────────────────────
160
+ # enumerate_query node already set is_enumeration_query=True and populated
161
+ # reranked_chunks with deduplicated, alphabetically-sorted title chunks.
162
+ # We format the pre-fetched list with a special prompt — no extra LLM reasoning
163
+ # needed, just reliable numbered-list formatting with one citation per item.
164
+ if state.get("is_enumeration_query") and reranked_chunks:
165
+ writer({"type": "status", "label": "Formatting complete list..."})
166
+ context_parts: list[str] = []
167
+ source_refs: list[SourceRef] = []
168
+ for i, chunk in enumerate(reranked_chunks, start=1):
169
+ meta = chunk["metadata"]
170
+ header = f"[{i}] {meta.get('source_title', 'Item')}"
171
+ if meta.get("source_url"):
172
+ header += f" ({meta['source_url']})"
173
+ context_parts.append(f"{header}\n{chunk['text'][:300]}")
174
+ source_refs.append(
175
+ SourceRef(
176
+ title=meta.get("source_title", ""),
177
+ url=meta.get("source_url", ""),
178
+ section=meta.get("section", ""),
179
+ )
180
+ )
181
+ context_block_enum = "\n\n".join(context_parts)
182
+ prompt_enum = f"Items fetched from database:\n{context_block_enum}\n\nVisitor request: {query}"
183
+ stream = llm_client.complete_with_complexity(
184
+ prompt=prompt_enum,
185
+ system=_ENUM_SYSTEM_PROMPT,
186
+ stream=True,
187
+ complexity="simple",
188
+ )
189
+ full_answer = ""
190
+ async for token in stream:
191
+ full_answer += token
192
+ writer({"type": "token", "text": token})
193
+ return {"answer": full_answer, "sources": source_refs, "path": "enumeration"}
194
+
195
+ # ── Not-found path ────────────────────────────────────────────────────────────
196
  if not reranked_chunks:
197
  writer({"type": "status", "label": "Could not find specific information, responding carefully..."})
198
+
199
+ # Fix 2 Rule 2: generate a specific, topical redirect suggestion using
200
+ # Gemini with the TOON portfolio entity list. Fires here (after all CRAG
201
+ # retries have been exhausted) so the visitor always gets a meaningful
202
+ # alternative rather than a generic catch-all footer.
203
+ query_topic = state.get("query_topic") or "that topic"
204
+ specific_suggestion = (
205
+ f"Try rephrasing about {query_topic} — I may know it under a different term."
206
+ )
207
+ if gemini_client is not None and gemini_client.is_configured:
208
+ try:
209
+ specific_suggestion = await gemini_client.generate_specific_suggestion(
210
+ query=query,
211
+ query_topic=query_topic,
212
+ suggestion_hint=SUGGESTION_HINT,
213
+ )
214
+ except Exception as exc:
215
+ logger.debug("Specific suggestion generation failed: %s", exc)
216
+
217
  history_prefix = _format_history(state)
218
+ prompt_not_found = (
219
+ f"{history_prefix}Visitor question: {query}\n\n"
220
+ f"SUGGESTION: {specific_suggestion}"
221
+ )
222
  stream = llm_client.complete_with_complexity(
223
+ prompt=prompt_not_found,
224
  system=_NOT_FOUND_SYSTEM,
225
  stream=True,
226
  complexity="simple",
app/pipeline/nodes/log_eval.py CHANGED
@@ -65,7 +65,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
65
  critic_groundedness INTEGER,
66
  critic_completeness INTEGER,
67
  critic_specificity INTEGER,
68
- critic_quality TEXT
 
69
  )
70
  """
71
  )
@@ -81,6 +82,8 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
81
  ("critic_completeness", "INTEGER"),
82
  ("critic_specificity", "INTEGER"),
83
  ("critic_quality", "TEXT"),
 
 
84
  ]:
85
  try:
86
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
@@ -92,8 +95,9 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
92
  INSERT INTO interactions
93
  (timestamp, session_id, query, answer, chunks_used, rerank_scores,
94
  reranked_chunks_json, latency_ms, cached, path,
95
- critic_groundedness, critic_completeness, critic_specificity, critic_quality)
96
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 
97
  """,
98
  (
99
  datetime.now(tz=timezone.utc).isoformat(),
@@ -110,6 +114,7 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
110
  state.get("critic_completeness"),
111
  state.get("critic_specificity"),
112
  state.get("critic_quality"),
 
113
  ),
114
  )
115
  return cursor.lastrowid # type: ignore[return-value]
@@ -145,6 +150,7 @@ def make_log_eval_node(db_path: str, github_log=None) -> Callable[[PipelineState
145
  "critic_completeness": state.get("critic_completeness"),
146
  "critic_specificity": state.get("critic_specificity"),
147
  "critic_quality": state.get("critic_quality"),
 
148
  }
149
  github_log.append(record)
150
 
 
65
  critic_groundedness INTEGER,
66
  critic_completeness INTEGER,
67
  critic_specificity INTEGER,
68
+ critic_quality TEXT,
69
+ is_enumeration_query BOOLEAN DEFAULT 0
70
  )
71
  """
72
  )
 
82
  ("critic_completeness", "INTEGER"),
83
  ("critic_specificity", "INTEGER"),
84
  ("critic_quality", "TEXT"),
85
+ # Fix 1: enumeration classifier flag
86
+ ("is_enumeration_query", "BOOLEAN DEFAULT 0"),
87
  ]:
88
  try:
89
  conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
 
95
  INSERT INTO interactions
96
  (timestamp, session_id, query, answer, chunks_used, rerank_scores,
97
  reranked_chunks_json, latency_ms, cached, path,
98
+ critic_groundedness, critic_completeness, critic_specificity, critic_quality,
99
+ is_enumeration_query)
100
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
101
  """,
102
  (
103
  datetime.now(tz=timezone.utc).isoformat(),
 
114
  state.get("critic_completeness"),
115
  state.get("critic_specificity"),
116
  state.get("critic_quality"),
117
+ state.get("is_enumeration_query", False),
118
  ),
119
  )
120
  return cursor.lastrowid # type: ignore[return-value]
 
150
  "critic_completeness": state.get("critic_completeness"),
151
  "critic_specificity": state.get("critic_specificity"),
152
  "critic_quality": state.get("critic_quality"),
153
+ "is_enumeration_query": state.get("is_enumeration_query", False),
154
  }
155
  github_log.append(record)
156
 
app/services/gemini_client.py CHANGED
@@ -447,3 +447,61 @@ class GeminiClient:
447
  # Non-fatal: log and fall back to RAG so users always get a response.
448
  logger.warning("Gemini fast path error (%s); routing to RAG.", exc)
449
  return None, query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  # Non-fatal: log and fall back to RAG so users always get a response.
448
  logger.warning("Gemini fast path error (%s); routing to RAG.", exc)
449
  return None, query
450
+
451
+ async def generate_specific_suggestion(
452
+ self,
453
+ query: str,
454
+ query_topic: str,
455
+ suggestion_hint: str,
456
+ ) -> str:
457
+ """
458
+ Fix 2 Rule 2 — generate a specific not-found redirect suggestion.
459
+
460
+ When the RAG pipeline finds nothing (after CRAG retry), instead of
461
+ the generic "ask about his projects", this method uses the TOON portfolio
462
+ context to produce a specific, topical suggestion grounded in real content.
463
+
464
+ Examples:
465
+ query_topic="kubernetes" →
466
+ "Ask about how Darshan deployed TextOps on Kubernetes with custom Helm charts."
467
+ query_topic="work experience" →
468
+ "Try asking about his role at VK Live or his responsibilities there."
469
+
470
+ Falls back to a topic-specific hardcoded suggestion if Gemini is unavailable.
471
+ The fallback itself uses ``query_topic`` so it is always more specific than
472
+ the generic "ask about his projects" footer.
473
+ """
474
+ if not self._client:
475
+ # Graceful fallback: still more specific than the old generic text.
476
+ return (
477
+ f"Try rephrasing your question about {query_topic} "
478
+ "— I may know it under a different term."
479
+ )
480
+
481
+ prompt = (
482
+ f"Portfolio content available:\n{suggestion_hint}\n\n"
483
+ f"Visitor asked: {query}\n"
484
+ f"Topic detected: {query_topic}\n\n"
485
+ "The search returned no results. Write ONE specific suggestion the visitor "
486
+ "should try instead, referencing a real item from the portfolio content above "
487
+ "that is most related to their query topic. "
488
+ "Format: 'Try asking about [specific item/aspect].' "
489
+ "Maximum 20 words. Output ONLY the suggestion sentence."
490
+ )
491
+ try:
492
+ from google.genai import types # noqa: PLC0415
493
+ response = await self._client.aio.models.generate_content( # type: ignore[attr-defined]
494
+ model=self._model,
495
+ contents=prompt,
496
+ config=types.GenerateContentConfig(temperature=0.3, max_output_tokens=60),
497
+ )
498
+ text = (response.candidates[0].content.parts[0].text or "").strip().strip('"')
499
+ if text:
500
+ logger.debug("Specific suggestion generated: %r", text[:80])
501
+ return text
502
+ except Exception as exc:
503
+ logger.warning("generate_specific_suggestion failed (%s); using fallback.", exc)
504
+ return (
505
+ f"Try rephrasing your question about {query_topic} "
506
+ "— I may know it under a different term."
507
+ )
app/services/vector_store.py CHANGED
@@ -274,3 +274,46 @@ class VectorStore:
274
  "search_by_raptor_level(level=%d) failed: %s — skipping RAPTOR results.", level, exc
275
  )
276
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  "search_by_raptor_level(level=%d) failed: %s — skipping RAPTOR results.", level, exc
275
  )
276
  return []
277
+
278
+ def scroll_by_source_type(
279
+ self,
280
+ source_types: list[str],
281
+ limit: int = 500,
282
+ ) -> list[Chunk]:
283
+ """
284
+ Retrieve all chunks matching any of the given source_types via payload
285
+ filter — no vector search involved.
286
+
287
+ Used by the enumeration_query node (Fix 1) to answer "list all projects /
288
+ blogs / skills" queries with zero embedding or reranker calls. The result
289
+ is deduplicated and sorted by the caller.
290
+
291
+ source_types: list of metadata.source_type values to include.
292
+ e.g. ["project"] or ["blog"] or ["cv", "project", "blog"]
293
+ limit: upper bound on total points fetched (safety cap; default 500 covers
294
+ any realistic personal portfolio without unbounded scrolling).
295
+ """
296
+ if not source_types:
297
+ return []
298
+ try:
299
+ # OR filter across all requested source types.
300
+ should_conditions = [
301
+ FieldCondition(
302
+ key="metadata.source_type",
303
+ match=MatchValue(value=st),
304
+ )
305
+ for st in source_types
306
+ ]
307
+ qdrant_filter = Filter(should=should_conditions)
308
+
309
+ records, _ = self.client.scroll(
310
+ collection_name=self.collection,
311
+ scroll_filter=qdrant_filter,
312
+ limit=limit,
313
+ with_payload=True,
314
+ with_vectors=False,
315
+ )
316
+ return [Chunk(**rec.payload) for rec in records if rec.payload]
317
+ except Exception as exc:
318
+ logger.warning("scroll_by_source_type(%r) failed: %s", source_types, exc)
319
+ return []
tests/conftest.py CHANGED
@@ -62,6 +62,9 @@ def app_client():
62
  if isinstance(stream_mode, list):
63
  yield ("custom", {"type": "status", "label": "Checking your question"})
64
  yield ("updates", {"guard": {"guard_passed": True}})
 
 
 
65
  yield ("updates", {"cache": {"cached": False}})
66
  yield ("custom", {"type": "status", "label": "Thinking about your question directly..."})
67
  yield ("custom", {"type": "token", "text": "I built TextOps."})
@@ -69,6 +72,7 @@ def app_client():
69
  else:
70
  # Fallback for any code that still calls astream without stream_mode.
71
  yield {"guard": {"guard_passed": True}}
 
72
  yield {"cache": {"cached": False}}
73
  yield {"generate": {"answer": "I built TextOps.", "sources": []}}
74
 
 
62
  if isinstance(stream_mode, list):
63
  yield ("custom", {"type": "status", "label": "Checking your question"})
64
  yield ("updates", {"guard": {"guard_passed": True}})
65
+ # Fix 1: enumerate_query node runs after guard on every request.
66
+ # Non-enumeration queries set is_enumeration_query=False and pass through.
67
+ yield ("updates", {"enumerate_query": {"is_enumeration_query": False}})
68
  yield ("updates", {"cache": {"cached": False}})
69
  yield ("custom", {"type": "status", "label": "Thinking about your question directly..."})
70
  yield ("custom", {"type": "token", "text": "I built TextOps."})
 
72
  else:
73
  # Fallback for any code that still calls astream without stream_mode.
74
  yield {"guard": {"guard_passed": True}}
75
+ yield {"enumerate_query": {"is_enumeration_query": False}}
76
  yield {"cache": {"cached": False}}
77
  yield {"generate": {"answer": "I built TextOps.", "sources": []}}
78
 
tests/test_enumerate_query.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend/tests/test_enumerate_query.py
2
+ # Unit tests for the enumeration query classifier (Fix 1) and
3
+ # the portfolio-relevance helper (Fix 2 Rule 1).
4
+ #
5
+ # All tests are pure-Python; no network calls, no Qdrant, no embedder.
6
+
7
+ import pytest
8
+ from unittest.mock import AsyncMock, MagicMock, patch
9
+
10
+ from app.pipeline.nodes.enumerate_query import (
11
+ _has_enumeration_intent,
12
+ _extract_source_types,
13
+ make_enumerate_query_node,
14
+ )
15
+ from app.core.portfolio_context import is_portfolio_relevant
16
+
17
+ # Patch target for LangGraph's stream writer, which requires a runnable context
18
+ # that doesn't exist in unit tests.
19
+ _WRITER_PATCH = "app.pipeline.nodes.enumerate_query.get_stream_writer"
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # _has_enumeration_intent
24
+ # ---------------------------------------------------------------------------
25
+
26
+
27
+ class TestHasEnumerationIntent:
28
+ def test_list_all_projects(self):
29
+ assert _has_enumeration_intent("list all projects") is True
30
+
31
+ def test_list_projects_no_all(self):
32
+ assert _has_enumeration_intent("list projects") is True
33
+
34
+ def test_show_all_blogs(self):
35
+ assert _has_enumeration_intent("show all blog posts") is True
36
+
37
+ def test_how_many_blogs(self):
38
+ assert _has_enumeration_intent("how many blog posts do you have") is True
39
+
40
+ def test_count_projects(self):
41
+ assert _has_enumeration_intent("count projects") is True
42
+
43
+ def test_enumerate_skills(self):
44
+ assert _has_enumeration_intent("enumerate all skills") is True
45
+
46
+ def test_give_me_a_list_of(self):
47
+ assert _has_enumeration_intent("give me a list of your projects") is True
48
+
49
+ def test_what_are_all_the_projects(self):
50
+ # trailing-regex pattern: "what are all the X"
51
+ assert _has_enumeration_intent("what are all the projects") is True
52
+
53
+ def test_which_are_all_the_blogs(self):
54
+ # Requires "all" keyword — the trailing regex gate prevents over-triggering.
55
+ assert _has_enumeration_intent("which are all the blog posts") is True
56
+
57
+ def test_regular_how_query_no_intent(self):
58
+ assert _has_enumeration_intent("how does TextOps work") is False
59
+
60
+ def test_explain_query_no_intent(self):
61
+ assert _has_enumeration_intent("explain the architecture of PersonaBot") is False
62
+
63
+ def test_what_is_query_no_intent(self):
64
+ assert _has_enumeration_intent("what is echo-echo") is False
65
+
66
+ def test_tell_me_about_no_intent(self):
67
+ assert _has_enumeration_intent("tell me about your background") is False
68
+
69
+ def test_empty_string(self):
70
+ assert _has_enumeration_intent("") is False
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # _extract_source_types
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
+ class TestExtractSourceTypes:
79
+ def test_projects(self):
80
+ types = _extract_source_types("list all projects")
81
+ assert "project" in types
82
+
83
+ def test_blogs(self):
84
+ types = _extract_source_types("show all blog posts")
85
+ assert "blog" in types
86
+
87
+ def test_skills_cv(self):
88
+ types = _extract_source_types("list all your skills")
89
+ assert "cv" in types
90
+
91
+ def test_generic_returns_empty(self):
92
+ # "everything" or "all" without a type token → [] meaning scroll all types
93
+ types = _extract_source_types("list everything")
94
+ assert types == []
95
+
96
+ def test_github_repos(self):
97
+ types = _extract_source_types("show all github repos")
98
+ assert "github" in types
99
+
100
+ def test_work_experience(self):
101
+ types = _extract_source_types("list all work experience")
102
+ assert "cv" in types
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # make_enumerate_query_node
107
+ # ---------------------------------------------------------------------------
108
+
109
+
110
+ @pytest.mark.asyncio
111
+ async def test_non_enumeration_query_passes_through():
112
+ """A regular query must exit the node with is_enumeration_query=False."""
113
+ mock_vs = MagicMock()
114
+ mock_vs.scroll_by_source_type = MagicMock(return_value=[])
115
+
116
+ node = make_enumerate_query_node(mock_vs)
117
+ state = {"query": "how does TextOps work", "retrieval_attempts": 0}
118
+ with patch(_WRITER_PATCH, return_value=MagicMock()):
119
+ result = node(state)
120
+
121
+ assert result["is_enumeration_query"] is False
122
+ # Vector store must NOT be called for normal queries (zero cost guarantee).
123
+ mock_vs.scroll_by_source_type.assert_not_called()
124
+
125
+
126
+ @pytest.mark.asyncio
127
+ async def test_enumeration_query_sets_flag_and_populates_chunks():
128
+ """An enumeration query must call scroll and set is_enumeration_query=True."""
129
+ chunk_a = {
130
+ "text": "TextOps is a CLI toolkit.",
131
+ "metadata": {"source_title": "TextOps", "source_type": "project", "doc_id": "textops-1"},
132
+ }
133
+ chunk_b = {
134
+ "text": "Echo-Echo is a WebRTC demo.",
135
+ "metadata": {"source_title": "Echo-Echo", "source_type": "project", "doc_id": "echo-1"},
136
+ }
137
+ mock_vs = MagicMock()
138
+ mock_vs.scroll_by_source_type = MagicMock(return_value=[chunk_a, chunk_b])
139
+
140
+ node = make_enumerate_query_node(mock_vs)
141
+ state = {"query": "list all projects", "retrieval_attempts": 0}
142
+ with patch(_WRITER_PATCH, return_value=MagicMock()):
143
+ result = node(state)
144
+
145
+ assert result["is_enumeration_query"] is True
146
+ assert len(result["reranked_chunks"]) == 2
147
+ mock_vs.scroll_by_source_type.assert_called_once()
148
+
149
+
150
+ @pytest.mark.asyncio
151
+ async def test_enumeration_deduplicates_by_source_title():
152
+ """Duplicate source_title chunks must be collapsed to one representative."""
153
+ chunk_a = {
154
+ "text": "TextOps chunk 1",
155
+ "metadata": {"source_title": "TextOps", "source_type": "project", "doc_id": "textops-1"},
156
+ }
157
+ chunk_b = {
158
+ "text": "TextOps chunk 2",
159
+ "metadata": {"source_title": "TextOps", "source_type": "project", "doc_id": "textops-2"},
160
+ }
161
+ mock_vs = MagicMock()
162
+ mock_vs.scroll_by_source_type = MagicMock(return_value=[chunk_a, chunk_b])
163
+
164
+ node = make_enumerate_query_node(mock_vs)
165
+ state = {"query": "list all projects", "retrieval_attempts": 0}
166
+ with patch(_WRITER_PATCH, return_value=MagicMock()):
167
+ result = node(state)
168
+
169
+ assert result["is_enumeration_query"] is True
170
+ assert len(result["reranked_chunks"]) == 1
171
+
172
+
173
+ @pytest.mark.asyncio
174
+ async def test_enumeration_empty_scroll_returns_not_found():
175
+ """When Qdrant returns no chunks, is_enumeration_query stays False (no results to list)."""
176
+ mock_vs = MagicMock()
177
+ mock_vs.scroll_by_source_type = MagicMock(return_value=[])
178
+
179
+ node = make_enumerate_query_node(mock_vs)
180
+ state = {"query": "list all projects", "retrieval_attempts": 0}
181
+ with patch(_WRITER_PATCH, return_value=MagicMock()):
182
+ result = node(state)
183
+
184
+ # With no chunks, the node does not commit to enumeration path; falls to RAG.
185
+ assert result["is_enumeration_query"] is False
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # is_portfolio_relevant (Fix 2 Rule 1)
190
+ # ---------------------------------------------------------------------------
191
+
192
+
193
+ class TestIsPortfolioRelevant:
194
+ def test_known_project_name(self):
195
+ assert is_portfolio_relevant("how does textops work") is True
196
+
197
+ def test_known_project_variant(self):
198
+ assert is_portfolio_relevant("tell me about echo echo") is True
199
+
200
+ def test_known_technology(self):
201
+ assert is_portfolio_relevant("explain the use of langchain in your stack") is True
202
+
203
+ def test_known_organisation(self):
204
+ assert is_portfolio_relevant("what did you do at vk live") is True
205
+
206
+ def test_unrelated_query(self):
207
+ assert is_portfolio_relevant("what is the weather in london") is False
208
+
209
+ def test_generic_question(self):
210
+ assert is_portfolio_relevant("tell me a joke") is False
211
+
212
+ def test_empty_string(self):
213
+ assert is_portfolio_relevant("") is False