GitHub Actions commited on
Commit
9563e4a
·
1 Parent(s): 1d47e3c

Deploy 1ba9ba6

Browse files
app/api/chat.py CHANGED
@@ -12,6 +12,14 @@ from app.security.jwt_auth import verify_jwt
12
 
13
  router = APIRouter()
14
 
 
 
 
 
 
 
 
 
15
  # Phrases a visitor uses when telling the bot it gave a wrong answer.
16
  # Matched on the lowercased raw message before any LLM call — O(1), zero cost.
17
  _CRITICISM_SIGNALS: frozenset[str] = frozenset({
@@ -72,7 +80,6 @@ async def _generate_follow_ups(
72
  for s in sources[:4]:
73
  title = s.title if hasattr(s, "title") else s.get("title", "")
74
  src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
75
- url = s.url if hasattr(s, "url") else s.get("url", "")
76
  if title:
77
  source_info.append(f"{title} ({src_type})" if src_type else title)
78
 
@@ -196,7 +203,7 @@ async def chat_endpoint(
196
  # will use it if present; Guard runs first so the latency is masked).
197
  if decontext_task is not None:
198
  try:
199
- result = await asyncio.wait_for(decontext_task, timeout=3.0)
200
  if result and result.strip().lower() != request_data.message.strip().lower():
201
  decontextualized_query = result.strip()
202
  except Exception:
@@ -206,7 +213,7 @@ async def chat_endpoint(
206
  expansion_result: dict | None = None
207
  if expansion_task is not None:
208
  try:
209
- expansion_result = await asyncio.wait_for(expansion_task, timeout=0.8)
210
  except Exception:
211
  pass # Expansion is best-effort; retriever falls back to raw query.
212
 
@@ -260,14 +267,39 @@ async def chat_endpoint(
260
  interaction_id = None
261
 
262
  try:
 
 
 
263
  # stream_mode=["custom", "updates"] yields (mode, data) tuples:
264
  # mode="custom" → data is whatever writer(payload) was called with
265
  # mode="updates" → data is {node_name: state_updates_dict}
266
- async for mode, data in pipeline.astream(
267
  initial_state,
268
  stream_mode=["custom", "updates"],
269
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  if await request.is_disconnected():
 
 
271
  break
272
 
273
  if mode == "custom":
@@ -342,5 +374,9 @@ async def chat_endpoint(
342
  return StreamingResponse(
343
  sse_generator(),
344
  media_type="text/event-stream",
345
- headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
 
 
 
 
346
  )
 
12
 
13
  router = APIRouter()
14
 
15
+ # Keep-alive interval for SSE when upstream nodes are still working.
16
+ # Prevents edge/proxy idle timeouts on long retrieval/generation turns.
17
+ _SSE_HEARTBEAT_SECONDS: float = 10.0
18
+
19
+ # Query pre-processing budgets must stay low to avoid delaying first byte.
20
+ _DECONTEXT_TIMEOUT_SECONDS: float = 0.35
21
+ _EXPANSION_TIMEOUT_SECONDS: float = 0.25
22
+
23
  # Phrases a visitor uses when telling the bot it gave a wrong answer.
24
  # Matched on the lowercased raw message before any LLM call — O(1), zero cost.
25
  _CRITICISM_SIGNALS: frozenset[str] = frozenset({
 
80
  for s in sources[:4]:
81
  title = s.title if hasattr(s, "title") else s.get("title", "")
82
  src_type = s.source_type if hasattr(s, "source_type") else s.get("source_type", "")
 
83
  if title:
84
  source_info.append(f"{title} ({src_type})" if src_type else title)
85
 
 
203
  # will use it if present; Guard runs first so the latency is masked).
204
  if decontext_task is not None:
205
  try:
206
+ result = await asyncio.wait_for(decontext_task, timeout=_DECONTEXT_TIMEOUT_SECONDS)
207
  if result and result.strip().lower() != request_data.message.strip().lower():
208
  decontextualized_query = result.strip()
209
  except Exception:
 
213
  expansion_result: dict | None = None
214
  if expansion_task is not None:
215
  try:
216
+ expansion_result = await asyncio.wait_for(expansion_task, timeout=_EXPANSION_TIMEOUT_SECONDS)
217
  except Exception:
218
  pass # Expansion is best-effort; retriever falls back to raw query.
219
 
 
267
  interaction_id = None
268
 
269
  try:
270
+ # Emit an early event so clients/proxies receive first bytes quickly.
271
+ yield f"event: status\ndata: {json.dumps({'label': 'Starting response...'})}\n\n"
272
+
273
  # stream_mode=["custom", "updates"] yields (mode, data) tuples:
274
  # mode="custom" → data is whatever writer(payload) was called with
275
  # mode="updates" → data is {node_name: state_updates_dict}
276
+ stream_iter = pipeline.astream(
277
  initial_state,
278
  stream_mode=["custom", "updates"],
279
+ ).__aiter__()
280
+ next_item_task: asyncio.Task | None = asyncio.create_task(stream_iter.__anext__())
281
+
282
+ while True:
283
+ try:
284
+ mode, data = await asyncio.wait_for(
285
+ asyncio.shield(next_item_task),
286
+ timeout=_SSE_HEARTBEAT_SECONDS,
287
+ )
288
+ except asyncio.TimeoutError:
289
+ if await request.is_disconnected():
290
+ if not next_item_task.done():
291
+ next_item_task.cancel()
292
+ break
293
+ yield f"event: ping\ndata: {json.dumps({'ts': int(time.time())})}\n\n"
294
+ continue
295
+ except StopAsyncIteration:
296
+ break
297
+
298
+ next_item_task = asyncio.create_task(stream_iter.__anext__())
299
+
300
  if await request.is_disconnected():
301
+ if not next_item_task.done():
302
+ next_item_task.cancel()
303
  break
304
 
305
  if mode == "custom":
 
374
  return StreamingResponse(
375
  sse_generator(),
376
  media_type="text/event-stream",
377
+ headers={
378
+ "Cache-Control": "no-cache",
379
+ "X-Accel-Buffering": "no",
380
+ "Connection": "keep-alive",
381
+ },
382
  )
app/core/quality.py CHANGED
@@ -29,8 +29,13 @@ _HEDGE_PHRASES: tuple[str, ...] = (
29
  "does not provide",
30
  "does not offer",
31
  "no detailed information",
 
 
 
32
  )
33
 
 
 
34
 
35
  def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
36
  """
@@ -46,6 +51,8 @@ def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
46
  lowered = answer.lower()
47
  if any(phrase in lowered for phrase in _HEDGE_PHRASES):
48
  return True
 
 
49
  if chunks and not re.search(r"\[\d+\]", answer):
50
  return True
51
  if complexity == "complex" and len(answer.split()) < 30:
 
29
  "does not provide",
30
  "does not offer",
31
  "no detailed information",
32
+ "not explicitly state",
33
+ "not explicitly stated",
34
+ "cannot be verified",
35
  )
36
 
37
+ _RAW_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
38
+
39
 
40
  def is_low_trust(answer: str, chunks: list, complexity: str) -> bool:
41
  """
 
51
  lowered = answer.lower()
52
  if any(phrase in lowered for phrase in _HEDGE_PHRASES):
53
  return True
54
+ if _RAW_TAG_RE.search(answer):
55
+ return True
56
  if chunks and not re.search(r"\[\d+\]", answer):
57
  return True
58
  if complexity == "complex" and len(answer.split()) < 30:
app/pipeline/nodes/generate.py CHANGED
@@ -17,6 +17,9 @@ logger = logging.getLogger(__name__)
17
  _THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
18
  _THINK_OPEN_RE = re.compile(r"<think>")
19
  _THINK_CLOSE_RE = re.compile(r"</think>")
 
 
 
20
 
21
  # Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
22
  # (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
@@ -73,6 +76,10 @@ ANSWERING RULES — follow all of them every time:
73
  those facts — a short confident answer beats a padded hallucinated one.
74
  7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
75
  8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
 
 
 
 
76
 
77
  RELEVANCE CHECK — do this BEFORE writing:
78
  - Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
@@ -234,13 +241,67 @@ def _normalise_answer_text(answer: str, max_citation_index: int) -> str:
234
  idx = int(match.group(1))
235
  return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
236
 
237
- cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, answer)
 
238
  cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
239
  cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
240
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
241
  return cleaned.strip()
242
 
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
245
  # Number of token chunks to buffer before deciding there is no CoT block.
246
  # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
@@ -310,7 +371,8 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
310
  # TextOps become [1] and [2] — the LLM cites both in the same sentence,
311
  # which looks like self-citing hallucination even though it is technically
312
  # correct. _merge_by_source preserves all text; nothing is discarded.
313
- merged_chunks = _merge_by_source(reranked_chunks)
 
314
  context_parts: list[str] = []
315
  source_refs: list[SourceRef] = []
316
 
@@ -457,6 +519,12 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
457
 
458
  full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
459
 
 
 
 
 
 
 
460
  # Only surface sources the LLM actually cited, deduplicated by URL so
461
  # multiple chunks from the same document show as one source card.
462
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
 
17
  _THINK_COMPLETE_RE = re.compile(r"<think>[\s\S]*?</think>", re.DOTALL)
18
  _THINK_OPEN_RE = re.compile(r"<think>")
19
  _THINK_CLOSE_RE = re.compile(r"</think>")
20
+ _GEN_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^>]*>")
21
+ _VERSION_PARITY_RE = re.compile(r"\b(up[- ]?to[- ]?date|latest|current|in sync|same version|version)\b", re.IGNORECASE)
22
+ _WORD_RE = re.compile(r"[a-zA-Z0-9]+")
23
 
24
  # Chars to buffer at Phase-2 chunk boundaries to prevent split closing tags
25
  # (e.g., one SSE chunk ends with "</thi", next starts with "nk>") from being
 
76
  those facts — a short confident answer beats a padded hallucinated one.
77
  7. Vary your sentence openers. Never start two consecutive sentences with "Darshan".
78
  8. Length: 2–4 paragraphs for detailed topics; 1 paragraph for simple factual questions.
79
+ 9. If asked about freshness/version parity (e.g., "up-to-date", "same as demo"), and passages
80
+ do not explicitly confirm it, answer in at most 2 sentences: state what is known from passages,
81
+ then explicitly say it cannot be verified from indexed sources.
82
+ 10. Do not list unrelated projects or sources unless the user asked for a list/compare.
83
 
84
  RELEVANCE CHECK — do this BEFORE writing:
85
  - Examine EVERY passage, not just the first one. The most relevant passage may not be [1].
 
241
  idx = int(match.group(1))
242
  return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
243
 
244
+ cleaned = _GEN_HTML_TAG_RE.sub("", answer)
245
+ cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, cleaned)
246
  cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
247
  cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
248
  cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
249
  return cleaned.strip()
250
 
251
 
252
+ def _build_low_trust_fallback(query: str, source_refs: list[SourceRef]) -> str:
253
+ """Deterministic concise fallback when model output still fails trust checks."""
254
+ if not source_refs:
255
+ return _NOT_FOUND_ANSWER
256
+
257
+ first = source_refs[0]
258
+ title = first.title or "the retrieved source"
259
+
260
+ if _VERSION_PARITY_RE.search(query):
261
+ return (
262
+ f"The retrieved sources confirm links/details for {title} [1], but they do not explicitly "
263
+ "confirm whether the GitHub code and live demo are currently in sync, so version parity "
264
+ "cannot be verified from the indexed content alone [1]."
265
+ )
266
+
267
+ return (
268
+ f"Based on the retrieved evidence, the answer is grounded in {title} [1]. "
269
+ "If you want deeper detail, ask for a specific section, implementation part, or comparison."
270
+ )
271
+
272
+
273
+ def _select_chunks_for_prompt(query: str, reranked_chunks: list[dict]) -> list[dict]:
274
+ """
275
+ Prefer chunks whose source title is explicitly referenced in the query.
276
+
277
+ This prevents focused questions (e.g. one project) from receiving multi-project
278
+ blended context that can trigger verbose, low-quality comparison answers.
279
+ """
280
+ if not reranked_chunks:
281
+ return reranked_chunks
282
+
283
+ query_lower = query.lower()
284
+ focused: list[dict] = []
285
+
286
+ for chunk in reranked_chunks:
287
+ title = str(chunk["metadata"].get("source_title", "")).strip()
288
+ if not title:
289
+ continue
290
+ title_lower = title.lower()
291
+ if len(title_lower) >= 4 and title_lower in query_lower:
292
+ focused.append(chunk)
293
+ continue
294
+
295
+ title_tokens = [t for t in _WORD_RE.findall(title_lower) if len(t) >= 4]
296
+ if title_tokens and sum(1 for tok in title_tokens if tok in query_lower) >= min(2, len(title_tokens)):
297
+ focused.append(chunk)
298
+
299
+ if focused:
300
+ return focused[:6]
301
+
302
+ return reranked_chunks[:8]
303
+
304
+
305
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
306
  # Number of token chunks to buffer before deciding there is no CoT block.
307
  # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
 
371
  # TextOps become [1] and [2] — the LLM cites both in the same sentence,
372
  # which looks like self-citing hallucination even though it is technically
373
  # correct. _merge_by_source preserves all text; nothing is discarded.
374
+ selected_chunks = _select_chunks_for_prompt(query, reranked_chunks)
375
+ merged_chunks = _merge_by_source(selected_chunks)
376
  context_parts: list[str] = []
377
  source_refs: list[SourceRef] = []
378
 
 
519
 
520
  full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
521
 
522
+ # Final guardrail: if answer still looks low-trust after reformat + cleanup,
523
+ # return a concise deterministic fallback anchored to retrieved sources.
524
+ if is_low_trust(full_answer, reranked_chunks, complexity):
525
+ logger.debug("Final low-trust guard triggered; using deterministic fallback.")
526
+ full_answer = _build_low_trust_fallback(query, source_refs)
527
+
528
  # Only surface sources the LLM actually cited, deduplicated by URL so
529
  # multiple chunks from the same document show as one source card.
530
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
tests/test_chat_stream_reliability.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import time
4
+ from unittest.mock import MagicMock, patch
5
+
6
+ from fastapi.testclient import TestClient
7
+ from jose import jwt
8
+
9
+
10
+ def _make_token() -> str:
11
+ payload = {"sub": "test-user", "exp": int(time.time()) + 3600}
12
+ return jwt.encode(payload, "test-secret-32-chars-long-0000000", algorithm="HS256")
13
+
14
+
15
+ def _chat(client: TestClient, message: str) -> str:
16
+ token = _make_token()
17
+ response = client.post(
18
+ "/chat",
19
+ json={"message": message, "session_id": "a1b2c3d4-e5f6-4789-8abc-def012345678"},
20
+ headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
21
+ )
22
+ assert response.status_code == 200
23
+ return response.text
24
+
25
+
26
+ def test_stream_emits_early_status_and_heartbeat(monkeypatch):
27
+ # Reduce heartbeat interval so the test can verify keepalive quickly.
28
+ monkeypatch.setattr("app.api.chat._SSE_HEARTBEAT_SECONDS", 0.05)
29
+
30
+ mock_pipeline = MagicMock()
31
+
32
+ async def delayed_astream(state, stream_mode=None):
33
+ await asyncio.sleep(0.12)
34
+ yield ("custom", {"type": "status", "label": "Thinking..."})
35
+ yield ("custom", {"type": "token", "text": "Answer text."})
36
+ yield ("updates", {"generate": {"answer": "Answer text.", "sources": []}})
37
+
38
+ mock_pipeline.astream = delayed_astream
39
+
40
+ with patch("app.main.build_pipeline", return_value=mock_pipeline), \
41
+ patch("app.main.QdrantClient"), \
42
+ patch("app.services.embedder.Embedder"), \
43
+ patch("app.services.reranker.Reranker"):
44
+ from app.main import create_app
45
+
46
+ app = create_app()
47
+ app.state.pipeline = mock_pipeline
48
+ with TestClient(app, raise_server_exceptions=True) as client:
49
+ body = _chat(client, "Tell me about TextOps")
50
+
51
+ assert "event: status" in body
52
+ assert "Starting response..." in body
53
+ assert "event: ping" in body
54
+ assert "Answer text." in body
tests/test_generate_focus_selection.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.pipeline.nodes.generate import _select_chunks_for_prompt
2
+
3
+
4
+ def _chunk(title: str, section: str = "Overview") -> dict:
5
+ return {
6
+ "text": f"Info about {title}",
7
+ "metadata": {
8
+ "source_title": title,
9
+ "section": section,
10
+ "source_url": "",
11
+ "source_type": "project",
12
+ },
13
+ }
14
+
15
+
16
+ def test_select_chunks_prefers_explicitly_mentioned_source_title() -> None:
17
+ chunks = [
18
+ _chunk("Sorting Demo"),
19
+ _chunk("EchoEcho"),
20
+ _chunk("Donut.asm"),
21
+ ]
22
+
23
+ selected = _select_chunks_for_prompt(
24
+ "Is the source code on GitHub up-to-date with the Sorting Demo live demo?",
25
+ chunks,
26
+ )
27
+
28
+ assert selected
29
+ assert all(c["metadata"]["source_title"] == "Sorting Demo" for c in selected)
30
+
31
+
32
+ def test_select_chunks_falls_back_to_top_ranked_when_no_title_match() -> None:
33
+ chunks = [_chunk("A"), _chunk("B"), _chunk("C")]
34
+
35
+ selected = _select_chunks_for_prompt("What technologies are used?", chunks)
36
+
37
+ assert selected == chunks
tests/test_generate_quality_fallback.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.models.chat import SourceRef
2
+ from app.pipeline.nodes.generate import _build_low_trust_fallback
3
+
4
+
5
+ def test_low_trust_fallback_for_version_parity_queries() -> None:
6
+ sources = [
7
+ SourceRef(
8
+ title="Sorting Demo",
9
+ url="https://github.com/1337Xcode/sortingdemo",
10
+ section="Overview",
11
+ source_type="project",
12
+ )
13
+ ]
14
+
15
+ answer = _build_low_trust_fallback(
16
+ "Is the source code up-to-date with the online demo version?",
17
+ sources,
18
+ )
19
+
20
+ assert "cannot be verified" in answer
21
+ assert "[1]" in answer
22
+
23
+
24
+ def test_low_trust_fallback_general_query_is_concise() -> None:
25
+ sources = [
26
+ SourceRef(
27
+ title="Sorting Demo",
28
+ url="https://github.com/1337Xcode/sortingdemo",
29
+ section="Overview",
30
+ source_type="project",
31
+ )
32
+ ]
33
+
34
+ answer = _build_low_trust_fallback(
35
+ "What technology is used to build Sorting Demo?",
36
+ sources,
37
+ )
38
+
39
+ assert "Sorting Demo" in answer
40
+ assert "[1]" in answer
tests/test_parser_config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ ROOT = Path(__file__).resolve().parents[2]
6
+ if str(ROOT) not in sys.path:
7
+ sys.path.insert(0, str(ROOT))
8
+
9
+ from ingestion.parser_config import load_parser_config
10
+
11
+
12
+ def test_parser_config_defaults_all_enabled(monkeypatch) -> None:
13
+ for key in (
14
+ "INGEST_ENABLE_BLOG_MDX",
15
+ "INGEST_ENABLE_PROJECT_MDX",
16
+ "INGEST_ENABLE_PDF",
17
+ "INGEST_ENABLE_GITHUB_README",
18
+ ):
19
+ monkeypatch.delenv(key, raising=False)
20
+
21
+ cfg = load_parser_config()
22
+
23
+ assert cfg.enable_blog_mdx is True
24
+ assert cfg.enable_project_mdx is True
25
+ assert cfg.enable_pdf is True
26
+ assert cfg.enable_github_readme is True
27
+
28
+
29
+ def test_parser_config_can_disable_selective_parsers(monkeypatch) -> None:
30
+ monkeypatch.setenv("INGEST_ENABLE_BLOG_MDX", "false")
31
+ monkeypatch.setenv("INGEST_ENABLE_PROJECT_MDX", "true")
32
+ monkeypatch.setenv("INGEST_ENABLE_PDF", "0")
33
+ monkeypatch.setenv("INGEST_ENABLE_GITHUB_README", "yes")
34
+
35
+ cfg = load_parser_config()
36
+
37
+ assert cfg.enable_blog_mdx is False
38
+ assert cfg.enable_project_mdx is True
39
+ assert cfg.enable_pdf is False
40
+ assert cfg.enable_github_readme is True
tests/test_parser_sanitization.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ ROOT = Path(__file__).resolve().parents[2]
5
+ if str(ROOT) not in sys.path:
6
+ sys.path.insert(0, str(ROOT))
7
+
8
+ from ingestion.parsers.readme_parser import parse_readme_bytes
9
+ from ingestion.parsers.text_sanitizer import strip_html_tags
10
+
11
+
12
+ def test_strip_html_tags_removes_img_and_comments() -> None:
13
+ text = "Hello <!-- comment --> <img src='x'> world <b>bold</b>"
14
+ cleaned = strip_html_tags(text)
15
+ assert "<img" not in cleaned
16
+ assert "<!--" not in cleaned
17
+ assert "<b>" not in cleaned
18
+ assert "Hello" in cleaned and "world" in cleaned
19
+
20
+
21
+ def test_parse_readme_bytes_removes_raw_html() -> None:
22
+ readme = b"# Repo\n\n<img src='banner.png'/>\n\nSome content"
23
+ parsed = parse_readme_bytes(readme, repo_name="1337Xcode/demo")
24
+ assert "<img" not in parsed["clean_content"]
25
+ assert "Some content" in parsed["clean_content"]
26
+