GitHub Actions committed on
Commit
385ac95
·
1 Parent(s): 06307b1

Deploy fe36296

Browse files
app/pipeline/nodes/generate.py CHANGED
@@ -2,6 +2,7 @@ import asyncio
2
  import logging
3
  import re
4
  from typing import Callable
 
5
 
6
  from langgraph.config import get_stream_writer
7
 
@@ -220,7 +221,7 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
220
  seen: set[str] = set()
221
  result: list[SourceRef] = []
222
  for sr in source_refs:
223
- key = sr.url or sr.title
224
  if key not in seen:
225
  seen.add(key)
226
  result.append(sr)
@@ -229,11 +230,35 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
229
  return result
230
 
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  def _source_identity_key(source_ref: SourceRef) -> str:
233
  """Stable per-document identity used for citation/source deduping."""
234
- if source_ref.url:
235
- return source_ref.url.strip().lower()
236
- return source_ref.title.strip().lower()
 
237
 
238
 
239
  def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:
 
2
  import logging
3
  import re
4
  from typing import Callable
5
+ from urllib.parse import urlsplit, urlunsplit
6
 
7
  from langgraph.config import get_stream_writer
8
 
 
221
  seen: set[str] = set()
222
  result: list[SourceRef] = []
223
  for sr in source_refs:
224
+ key = _source_identity_key(sr)
225
  if key not in seen:
226
  seen.add(key)
227
  result.append(sr)
 
230
  return result
231
 
232
 
233
+ def _normalise_source_url(url: str) -> str:
234
+ """Canonical URL form used for stable source identity comparisons."""
235
+ raw = (url or "").strip()
236
+ if not raw:
237
+ return ""
238
+
239
+ # Some source metadata is host/path without a scheme; normalise for parsing.
240
+ if "://" not in raw:
241
+ raw = f"https://{raw}"
242
+
243
+ parts = urlsplit(raw)
244
+ netloc = parts.netloc.lower()
245
+ if netloc.startswith("www."):
246
+ netloc = netloc[4:]
247
+
248
+ path = re.sub(r"/{2,}", "/", parts.path or "")
249
+ if path != "/":
250
+ path = path.rstrip("/")
251
+
252
+ scheme = (parts.scheme or "https").lower()
253
+ return urlunsplit((scheme, netloc, path, "", ""))
254
+
255
+
256
  def _source_identity_key(source_ref: SourceRef) -> str:
257
  """Stable per-document identity used for citation/source deduping."""
258
+ normalised_url = _normalise_source_url(source_ref.url)
259
+ if normalised_url:
260
+ return normalised_url
261
+ return re.sub(r"\s+", " ", source_ref.title or "").strip().lower()
262
 
263
 
264
  def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:
tests/test_generate_citation_reindex.py CHANGED
@@ -1,5 +1,5 @@
1
  from app.models.chat import SourceRef
2
- from app.pipeline.nodes.generate import _reindex_citations_and_sources
3
 
4
 
5
  def test_reindex_citations_compacts_in_first_mention_order() -> None:
@@ -48,3 +48,40 @@ def test_reindex_citations_drops_out_of_range_markers() -> None:
48
  assert "[1]" in new_answer
49
  assert "[9]" not in new_answer
50
  assert len(new_sources) == 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from app.models.chat import SourceRef
2
+ from app.pipeline.nodes.generate import _dedup_sources, _reindex_citations_and_sources
3
 
4
 
5
  def test_reindex_citations_compacts_in_first_mention_order() -> None:
 
48
  assert "[1]" in new_answer
49
  assert "[9]" not in new_answer
50
  assert len(new_sources) == 1
51
+
52
+
53
+ def test_reindex_citations_merges_url_variants_same_document() -> None:
54
+ sources = [
55
+ SourceRef(
56
+ title="Resume",
57
+ url="https://darshanchheda.com/resume/",
58
+ section="Experience",
59
+ source_type="cv",
60
+ ),
61
+ SourceRef(
62
+ title="Resume",
63
+ url="https://www.darshanchheda.com/resume?ref=nav#top",
64
+ section="Skills",
65
+ source_type="cv",
66
+ ),
67
+ ]
68
+ answer = "Resume evidence appears in both chunks [1][2]."
69
+
70
+ new_answer, new_sources = _reindex_citations_and_sources(answer, sources)
71
+
72
+ assert new_answer.count("[1]") == 1
73
+ assert "[2]" not in new_answer
74
+ assert len(new_sources) == 1
75
+ assert new_sources[0].title == "Resume"
76
+
77
+
78
+ def test_dedup_sources_merges_url_variants() -> None:
79
+ sources = [
80
+ SourceRef(title="Resume", url="darshanchheda.com/resume", section="", source_type="cv"),
81
+ SourceRef(title="Resume", url="https://darshanchheda.com/resume/", section="", source_type="cv"),
82
+ SourceRef(title="Project", url="https://darshanchheda.com/projects/textops", section="", source_type="project"),
83
+ ]
84
+
85
+ deduped = _dedup_sources(sources)
86
+
87
+ assert [s.title for s in deduped] == ["Resume", "Project"]