Spaces:
Running
Running
GitHub Actions committed on
Commit ·
385ac95
1
Parent(s): 06307b1
Deploy fe36296
Browse files
app/pipeline/nodes/generate.py
CHANGED
|
@@ -2,6 +2,7 @@ import asyncio
|
|
| 2 |
import logging
|
| 3 |
import re
|
| 4 |
from typing import Callable
|
|
|
|
| 5 |
|
| 6 |
from langgraph.config import get_stream_writer
|
| 7 |
|
|
@@ -220,7 +221,7 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
|
|
| 220 |
seen: set[str] = set()
|
| 221 |
result: list[SourceRef] = []
|
| 222 |
for sr in source_refs:
|
| 223 |
-
key = sr
|
| 224 |
if key not in seen:
|
| 225 |
seen.add(key)
|
| 226 |
result.append(sr)
|
|
@@ -229,11 +230,35 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
|
|
| 229 |
return result
|
| 230 |
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
def _source_identity_key(source_ref: SourceRef) -> str:
|
| 233 |
"""Stable per-document identity used for citation/source deduping."""
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:
|
|
|
|
| 2 |
import logging
|
| 3 |
import re
|
| 4 |
from typing import Callable
|
| 5 |
+
from urllib.parse import urlsplit, urlunsplit
|
| 6 |
|
| 7 |
from langgraph.config import get_stream_writer
|
| 8 |
|
|
|
|
| 221 |
seen: set[str] = set()
|
| 222 |
result: list[SourceRef] = []
|
| 223 |
for sr in source_refs:
|
| 224 |
+
key = _source_identity_key(sr)
|
| 225 |
if key not in seen:
|
| 226 |
seen.add(key)
|
| 227 |
result.append(sr)
|
|
|
|
| 230 |
return result
|
| 231 |
|
| 232 |
|
| 233 |
+
def _normalise_source_url(url: str) -> str:
    """Return a canonical form of *url* for stable source identity comparisons.

    The canonical form is ``scheme://host/path`` with:

    * surrounding whitespace removed;
    * a default ``https`` scheme when the input has none (bare ``host/path``
      or protocol-relative ``//host/path``);
    * the host lower-cased and a leading ``www.`` dropped;
    * runs of ``/`` in the path collapsed and any trailing ``/`` removed,
      including the lone root slash, so ``https://host/`` and
      ``https://host`` compare equal;
    * query string and fragment discarded.

    An explicit scheme is preserved, so ``http://`` and ``https://`` variants
    of the same page remain distinct.

    Args:
        url: Raw URL taken from source metadata; may be empty or ``None``.

    Returns:
        The canonical URL, or ``""`` when *url* is empty or whitespace only.
    """
    raw = (url or "").strip()
    if not raw:
        return ""

    # Some source metadata is host/path without a scheme; normalise for parsing.
    # Only a scheme at the very start counts: a "://" buried in a query string
    # (e.g. "host/redirect?u=https://x") must not suppress the default scheme.
    if raw.startswith("//"):
        raw = f"https:{raw}"
    elif not re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", raw):
        raw = f"https://{raw}"

    parts = urlsplit(raw)
    netloc = parts.netloc.lower()
    if netloc.startswith("www."):
        netloc = netloc[4:]

    # Strip the trailing slash unconditionally so the root path "/" and the
    # empty path produce the same key.
    path = re.sub(r"/{2,}", "/", parts.path or "").rstrip("/")

    scheme = (parts.scheme or "https").lower()
    return urlunsplit((scheme, netloc, path, "", ""))
|
| 254 |
+
|
| 255 |
+
|
| 256 |
def _source_identity_key(source_ref: SourceRef) -> str:
    """Build the per-document identity key used for citation/source deduping.

    The normalised URL is the key whenever it is non-empty. Otherwise the key
    is the title with whitespace runs collapsed to single spaces, stripped,
    and lower-cased.
    """
    url_key = _normalise_source_url(source_ref.url)
    if not url_key:
        # No usable URL: fall back to a whitespace- and case-insensitive title.
        collapsed_title = re.sub(r"\s+", " ", source_ref.title or "")
        return collapsed_title.strip().lower()
    return url_key
|
| 262 |
|
| 263 |
|
| 264 |
def _reindex_citations_and_sources(answer: str, source_refs: list[SourceRef]) -> tuple[str, list[SourceRef]]:
|
tests/test_generate_citation_reindex.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from app.models.chat import SourceRef
|
| 2 |
-
from app.pipeline.nodes.generate import _reindex_citations_and_sources
|
| 3 |
|
| 4 |
|
| 5 |
def test_reindex_citations_compacts_in_first_mention_order() -> None:
|
|
@@ -48,3 +48,40 @@ def test_reindex_citations_drops_out_of_range_markers() -> None:
|
|
| 48 |
assert "[1]" in new_answer
|
| 49 |
assert "[9]" not in new_answer
|
| 50 |
assert len(new_sources) == 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from app.models.chat import SourceRef
|
| 2 |
+
from app.pipeline.nodes.generate import _dedup_sources, _reindex_citations_and_sources
|
| 3 |
|
| 4 |
|
| 5 |
def test_reindex_citations_compacts_in_first_mention_order() -> None:
|
|
|
|
| 48 |
assert "[1]" in new_answer
|
| 49 |
assert "[9]" not in new_answer
|
| 50 |
assert len(new_sources) == 1
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_reindex_citations_merges_url_variants_same_document() -> None:
    """Citations to two URL spellings of one page collapse into a single [1]."""
    trailing_slash_ref = SourceRef(
        title="Resume",
        url="https://darshanchheda.com/resume/",
        section="Experience",
        source_type="cv",
    )
    # Same page reached via a "www." host with a query string and fragment.
    decorated_ref = SourceRef(
        title="Resume",
        url="https://www.darshanchheda.com/resume?ref=nav#top",
        section="Skills",
        source_type="cv",
    )

    new_answer, new_sources = _reindex_citations_and_sources(
        "Resume evidence appears in both chunks [1][2].",
        [trailing_slash_ref, decorated_ref],
    )

    assert new_answer.count("[1]") == 1
    assert "[2]" not in new_answer
    assert len(new_sources) == 1
    assert new_sources[0].title == "Resume"
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_dedup_sources_merges_url_variants() -> None:
    """A scheme-less URL and its https/trailing-slash form dedupe to one entry."""
    resume_bare = SourceRef(title="Resume", url="darshanchheda.com/resume", section="", source_type="cv")
    resume_full = SourceRef(title="Resume", url="https://darshanchheda.com/resume/", section="", source_type="cv")
    project = SourceRef(
        title="Project",
        url="https://darshanchheda.com/projects/textops",
        section="",
        source_type="project",
    )

    deduped = _dedup_sources([resume_bare, resume_full, project])

    # First occurrence wins and input order is preserved.
    surviving_titles = [ref.title for ref in deduped]
    assert surviving_titles == ["Resume", "Project"]
|