Spaces:

1337XCode
/

personabot-api

Running

App Files Files Community

GitHub Actions committed 6 days ago

Commit

385ac95

1 Parent(s): 06307b1

Deploy fe36296

Browse files

Files changed (2) hide show

app/pipeline/nodes/generate.py +29 -4
tests/test_generate_citation_reindex.py +38 -1

app/pipeline/nodes/generate.py CHANGED Viewed

@@ -2,6 +2,7 @@ import asyncio
 import logging
 import re
 from typing import Callable
 from langgraph.config import get_stream_writer
@@ -220,7 +221,7 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
     seen: set[str] = set()
     result: list[SourceRef] = []
     for sr in source_refs:
-        key = sr.url or sr.title
         if key not in seen:
             seen.add(key)
             result.append(sr)
@@ -229,11 +230,35 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
     return result
 def _source_identity_key(source_ref: SourceRef) -> str:
     """Stable per-document identity used for citation/source deduping."""
-    if source_ref.url:
-        return source_ref.url.strip().lower()
-    return source_ref.title.strip().lower()
 def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:

 import logging
 import re
 from typing import Callable
+from urllib.parse import urlsplit, urlunsplit
 from langgraph.config import get_stream_writer
     seen: set[str] = set()
     result: list[SourceRef] = []
     for sr in source_refs:
+        key = _source_identity_key(sr)
         if key not in seen:
             seen.add(key)
             result.append(sr)
     return result
+def _normalise_source_url(url: str) -> str:
+    """Canonical URL form used for stable source identity comparisons."""
+    raw = (url or "").strip()
+    if not raw:
+        return ""
+    # Some source metadata is host/path without a scheme; normalise for parsing.
+    if "://" not in raw:
+        raw = f"https://{raw}"
+    parts = urlsplit(raw)
+    netloc = parts.netloc.lower()
+    if netloc.startswith("www."):
+        netloc = netloc[4:]
+    path = re.sub(r"/{2,}", "/", parts.path or "")
+    if path != "/":
+        path = path.rstrip("/")
+    scheme = (parts.scheme or "https").lower()
+    return urlunsplit((scheme, netloc, path, "", ""))
 def _source_identity_key(source_ref: SourceRef) -> str:
     """Stable per-document identity used for citation/source deduping."""
+    normalised_url = _normalise_source_url(source_ref.url)
+    if normalised_url:
+        return normalised_url
+    return re.sub(r"\s+", " ", source_ref.title or "").strip().lower()
 def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:

tests/test_generate_citation_reindex.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from app.models.chat import SourceRef
-from app.pipeline.nodes.generate import _reindex_citations_and_sources
 def test_reindex_citations_compacts_in_first_mention_order() -> None:
@@ -48,3 +48,40 @@ def test_reindex_citations_drops_out_of_range_markers() -> None:
     assert "[1]" in new_answer
     assert "[9]" not in new_answer
     assert len(new_sources) == 1

 from app.models.chat import SourceRef
+from app.pipeline.nodes.generate import _dedup_sources, _reindex_citations_and_sources
 def test_reindex_citations_compacts_in_first_mention_order() -> None:
     assert "[1]" in new_answer
     assert "[9]" not in new_answer
     assert len(new_sources) == 1
+def test_reindex_citations_merges_url_variants_same_document() -> None:
+    sources = [
+        SourceRef(
+            title="Resume",
+            url="https://darshanchheda.com/resume/",
+            section="Experience",
+            source_type="cv",
+        ),
+        SourceRef(
+            title="Resume",
+            url="https://www.darshanchheda.com/resume?ref=nav#top",
+            section="Skills",
+            source_type="cv",
+        ),
+    ]
+    answer = "Resume evidence appears in both chunks [1][2]."
+    new_answer, new_sources = _reindex_citations_and_sources(answer, sources)
+    assert new_answer.count("[1]") == 1
+    assert "[2]" not in new_answer
+    assert len(new_sources) == 1
+    assert new_sources[0].title == "Resume"
+def test_dedup_sources_merges_url_variants() -> None:
+    sources = [
+        SourceRef(title="Resume", url="darshanchheda.com/resume", section="", source_type="cv"),
+        SourceRef(title="Resume", url="https://darshanchheda.com/resume/", section="", source_type="cv"),
+        SourceRef(title="Project", url="https://darshanchheda.com/projects/textops", section="", source_type="project"),
+    ]
+    deduped = _dedup_sources(sources)
+    assert [s.title for s in deduped] == ["Resume", "Project"]