nothex committed on
Commit
0f5e813
·
1 Parent(s): 96732d9

feat: add safe operator URL ingestion

Browse files
.env.example CHANGED
@@ -13,6 +13,10 @@ NVIDIA_API_BASE_URL=https://integrate.api.nvidia.com/v1
13
  # Admin review backend
14
  ADMIN_REVIEW_PROVIDER=auto
15
  ADMIN_REVIEW_MODEL=gemma4:latest
 
 
 
 
16
 
17
  # Supabase
18
  SUPABASE_URL=https://example.supabase.co
 
13
  # Admin review backend
14
  ADMIN_REVIEW_PROVIDER=auto
15
  ADMIN_REVIEW_MODEL=gemma4:latest
16
+ URL_INGEST_ADMIN_ENABLED=true
17
+ URL_INGEST_ALLOWED_HOSTS=
18
+ URL_INGEST_TIMEOUT_S=12
19
+ URL_INGEST_MAX_BYTES=1500000
20
 
21
  # Supabase
22
  SUPABASE_URL=https://example.supabase.co
.gitignore CHANGED
@@ -17,6 +17,7 @@ note_to_me.txt
17
 
18
  .dual-graph/
19
  tests/_tmp_graph_hybrid/
 
20
  tests/_tmp_pytest*/
21
  tests/_tmp_intent_monitor/
22
  tests/_tmp_intent_rollback/
 
17
 
18
  .dual-graph/
19
  tests/_tmp_graph_hybrid/
20
+ tests/_tmp_url_ingestion/
21
  tests/_tmp_pytest*/
22
  tests/_tmp_intent_monitor/
23
  tests/_tmp_intent_rollback/
backend/api/admin.py CHANGED
@@ -12,6 +12,8 @@ from pydantic import BaseModel
12
  from backend.core.auth_utils import require_auth_token
13
  from backend.core import config
14
  from backend.core.code_graph import index_python_codebase
 
 
15
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
16
  from backend.core.pipeline import _build_service_supabase_client
17
 
@@ -67,6 +69,11 @@ class CodeGraphIndexPayload(BaseModel):
67
  label: Optional[str] = None
68
 
69
 
 
 
 
 
 
70
  def _admin_client():
71
  return _build_service_supabase_client()
72
 
@@ -110,6 +117,11 @@ def _resolve_code_graph_root(root_path: str) -> Path:
110
  )
111
 
112
 
 
 
 
 
 
113
  def _trace_sort_key(row: dict):
114
  return row.get("created_at") or ""
115
 
@@ -923,3 +935,65 @@ def run_code_graph_index(
923
  "root_path": str(root),
924
  "result": result,
925
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from backend.core.auth_utils import require_auth_token
13
  from backend.core import config
14
  from backend.core.code_graph import index_python_codebase
15
+ from backend.core.tasks import process_document_task
16
+ from backend.core.url_ingestion import UrlIngestionError, fetch_url_to_tempfile
17
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
18
  from backend.core.pipeline import _build_service_supabase_client
19
 
 
69
  label: Optional[str] = None
70
 
71
 
72
class UrlIngestPayload(BaseModel):
    """Request body for the operator URL-ingest endpoints."""

    # Absolute http(s) URL to fetch; validated server-side by url_ingestion.
    url: str
    # Optional display label; when set it overrides the extracted page title.
    label: Optional[str] = None
75
+
76
+
77
  def _admin_client():
78
  return _build_service_supabase_client()
79
 
 
117
  )
118
 
119
 
120
def _ensure_url_ingest_enabled() -> None:
    """Abort with HTTP 403 when the operator URL-ingest feature flag is off."""
    if not config.URL_INGEST_ADMIN_ENABLED:
        raise HTTPException(status_code=403, detail="Operator URL ingestion is disabled.")
123
+
124
+
125
  def _trace_sort_key(row: dict):
126
  return row.get("created_at") or ""
127
 
 
935
  "root_path": str(root),
936
  "result": result,
937
  }
938
+
939
+
940
+ @router.get("/graph/url-ingest/options")
941
+ def get_url_ingest_options(
942
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
943
+ user_id: str = Depends(require_auth_token),
944
+ ):
945
+ _check_admin(x_admin_key)
946
+ return {
947
+ "ok": True,
948
+ "enabled": bool(config.URL_INGEST_ADMIN_ENABLED),
949
+ "allowed_hosts": list(config.URL_INGEST_ALLOWED_HOSTS),
950
+ "timeout_s": config.URL_INGEST_TIMEOUT_S,
951
+ "max_bytes": config.URL_INGEST_MAX_BYTES,
952
+ "user_id": user_id,
953
+ }
954
+
955
+
956
+ @router.post("/graph/url-ingest")
957
+ def run_url_ingest(
958
+ payload: UrlIngestPayload,
959
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
960
+ x_auth_token: str = Header(..., alias="X-Auth-Token"),
961
+ user_id: str = Depends(require_auth_token),
962
+ ):
963
+ del user_id
964
+ _check_admin(x_admin_key)
965
+ _ensure_url_ingest_enabled()
966
+ try:
967
+ fetched = fetch_url_to_tempfile(payload.url, label=payload.label)
968
+ except UrlIngestionError as exc:
969
+ raise HTTPException(status_code=400, detail=str(exc)) from exc
970
+ except Exception as exc:
971
+ log.error("Operator URL ingestion failed for %s: %s", payload.url, exc)
972
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
973
+
974
+ try:
975
+ task = process_document_task.delay(
976
+ fetched.temp_path,
977
+ fetched.filename,
978
+ x_auth_token,
979
+ "url",
980
+ "hybrid",
981
+ "url_fetch",
982
+ )
983
+ except Exception:
984
+ try:
985
+ os.unlink(fetched.temp_path)
986
+ except OSError:
987
+ log.warning("Could not remove fetched URL temp file %s", fetched.temp_path)
988
+ raise
989
+
990
+ return {
991
+ "ok": True,
992
+ "task_id": task.id,
993
+ "source_url": fetched.source_url,
994
+ "final_url": fetched.final_url,
995
+ "filename": fetched.filename,
996
+ "content_type": fetched.content_type,
997
+ "content_bytes": fetched.content_bytes,
998
+ "title": fetched.title,
999
+ }
backend/core/config.py CHANGED
@@ -279,6 +279,12 @@ CODE_GRAPH_ADMIN_ENABLED = os.getenv("CODE_GRAPH_ADMIN_ENABLED", "true").strip()
279
  "yes",
280
  "on",
281
  }
 
 
 
 
 
 
282
  CODE_GRAPH_ALLOWED_ROOTS = [
283
  candidate
284
  for candidate in [
@@ -291,6 +297,22 @@ CODE_GRAPH_ALLOWED_ROOTS = [
291
  ]
292
  if candidate
293
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
 
296
  def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
 
279
  "yes",
280
  "on",
281
  }
282
+ URL_INGEST_ADMIN_ENABLED = os.getenv("URL_INGEST_ADMIN_ENABLED", "true").strip().lower() in {
283
+ "1",
284
+ "true",
285
+ "yes",
286
+ "on",
287
+ }
288
  CODE_GRAPH_ALLOWED_ROOTS = [
289
  candidate
290
  for candidate in [
 
297
  ]
298
  if candidate
299
  ]
300
+ URL_INGEST_ALLOWED_HOSTS = [
301
+ segment.strip().lower()
302
+ for segment in os.getenv("URL_INGEST_ALLOWED_HOSTS", "").split(",")
303
+ if segment.strip()
304
+ ]
305
+ URL_INGEST_ALLOWED_CONTENT_TYPES = {
306
+ "text/html",
307
+ "application/xhtml+xml",
308
+ "text/plain",
309
+ "text/markdown",
310
+ }
311
+ URL_INGEST_TIMEOUT_S = float(os.getenv("URL_INGEST_TIMEOUT_S", "12"))
312
+ URL_INGEST_MAX_BYTES = int(os.getenv("URL_INGEST_MAX_BYTES", "1500000"))
313
+ URL_INGEST_USER_AGENT = os.getenv(
314
+ "URL_INGEST_USER_AGENT", "MorpheusBot/1.0 (+https://nothex-morpheus-rag.hf.space)"
315
+ ).strip()
316
 
317
 
318
  def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
backend/core/pipeline_ingestion.py CHANGED
@@ -406,6 +406,9 @@ def run_ingestion(
406
  original_filename: str = None,
407
  access_token: str = None,
408
  pdf_path: Optional[str] = None,
 
 
 
409
  ) -> str:
410
  """
411
  Ingestion orchestrator.
@@ -447,6 +450,12 @@ def run_ingestion(
447
  resolved_path,
448
  original_filename=original_filename,
449
  )
 
 
 
 
 
 
450
  log.info("=" * 50)
451
  log.info("Starting ingestion: %s", resolved_path)
452
 
@@ -502,12 +511,12 @@ def run_ingestion(
502
  access_token=access_token,
503
  )
504
 
505
- if source_kind == "markdown":
506
- _progress(2, "Parsing Markdown structure…")
507
  else:
508
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
509
  stage_started = time.perf_counter()
510
- if source_kind == "markdown":
511
  elements = partition_markdown(resolved_path)
512
  pdf_images = {}
513
  else:
@@ -517,6 +526,8 @@ def run_ingestion(
517
  if not elements:
518
  if source_kind == "markdown":
519
  raise ValueError("Markdown file appears empty or unreadable.")
 
 
520
  raise ValueError(
521
  "The PDF appears blank or unreadable. "
522
  "If scanned, ensure tesseract-ocr is installed."
@@ -528,6 +539,10 @@ def run_ingestion(
528
  raise ValueError(
529
  f"Markdown file contains almost no readable text ({text_chars} chars)."
530
  )
 
 
 
 
531
  raise ValueError(
532
  f"PDF contains almost no readable text ({text_chars} chars). "
533
  "May be corrupted or image-only without OCR layer."
@@ -567,7 +582,7 @@ def run_ingestion(
567
 
568
  _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
569
  stage_started = time.perf_counter()
570
- if source_kind == "markdown":
571
  chunks = _create_markdown_chunks(elements)
572
  else:
573
  chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
 
406
  original_filename: str = None,
407
  access_token: str = None,
408
  pdf_path: Optional[str] = None,
409
+ source_kind_override: Optional[str] = None,
410
+ data_shape_override: Optional[str] = None,
411
+ parser_kind_override: Optional[str] = None,
412
  ) -> str:
413
  """
414
  Ingestion orchestrator.
 
450
  resolved_path,
451
  original_filename=original_filename,
452
  )
453
+ if source_kind_override:
454
+ source_kind = str(source_kind_override).strip().lower()
455
+ if data_shape_override:
456
+ data_shape = str(data_shape_override).strip().lower()
457
+ if parser_kind_override:
458
+ parser_kind = str(parser_kind_override).strip().lower()
459
  log.info("=" * 50)
460
  log.info("Starting ingestion: %s", resolved_path)
461
 
 
511
  access_token=access_token,
512
  )
513
 
514
+ if source_kind in {"markdown", "url"}:
515
+ _progress(2, "Parsing document structure…")
516
  else:
517
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
518
  stage_started = time.perf_counter()
519
+ if source_kind in {"markdown", "url"}:
520
  elements = partition_markdown(resolved_path)
521
  pdf_images = {}
522
  else:
 
526
  if not elements:
527
  if source_kind == "markdown":
528
  raise ValueError("Markdown file appears empty or unreadable.")
529
+ if source_kind == "url":
530
+ raise ValueError("Fetched URL content appears empty or unreadable.")
531
  raise ValueError(
532
  "The PDF appears blank or unreadable. "
533
  "If scanned, ensure tesseract-ocr is installed."
 
539
  raise ValueError(
540
  f"Markdown file contains almost no readable text ({text_chars} chars)."
541
  )
542
+ if source_kind == "url":
543
+ raise ValueError(
544
+ f"Fetched URL content contains almost no readable text ({text_chars} chars)."
545
+ )
546
  raise ValueError(
547
  f"PDF contains almost no readable text ({text_chars} chars). "
548
  "May be corrupted or image-only without OCR layer."
 
582
 
583
  _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
584
  stage_started = time.perf_counter()
585
+ if source_kind in {"markdown", "url"}:
586
  chunks = _create_markdown_chunks(elements)
587
  else:
588
  chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
backend/core/tasks.py CHANGED
@@ -54,7 +54,15 @@ def _cleanup_temp_upload(tmp_path: str) -> None:
54
  log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
55
 
56
 
57
- def _process_document_task_impl(self, tmp_path: str, original_filename: str, access_token: str):
 
 
 
 
 
 
 
 
58
  """
59
  This runs in a completely separate background process!
60
  We pass a progress_callback to run_ingestion so it can report its status.
@@ -72,6 +80,9 @@ def _process_document_task_impl(self, tmp_path: str, original_filename: str, acc
72
  original_filename=original_filename,
73
  progress_callback=update_progress,
74
  access_token=access_token,
 
 
 
75
  )
76
  finally:
77
  _cleanup_temp_upload(tmp_path)
 
54
  log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
55
 
56
 
57
+ def _process_document_task_impl(
58
+ self,
59
+ tmp_path: str,
60
+ original_filename: str,
61
+ access_token: str,
62
+ source_kind_override: str | None = None,
63
+ data_shape_override: str | None = None,
64
+ parser_kind_override: str | None = None,
65
+ ):
66
  """
67
  This runs in a completely separate background process!
68
  We pass a progress_callback to run_ingestion so it can report its status.
 
80
  original_filename=original_filename,
81
  progress_callback=update_progress,
82
  access_token=access_token,
83
+ source_kind_override=source_kind_override,
84
+ data_shape_override=data_shape_override,
85
+ parser_kind_override=parser_kind_override,
86
  )
87
  finally:
88
  _cleanup_temp_upload(tmp_path)
backend/core/url_ingestion.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Operator-safe URL ingestion helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from html.parser import HTMLParser
7
+ import ipaddress
8
+ import os
9
+ from pathlib import Path
10
+ import socket
11
+ import tempfile
12
+ from typing import Iterable
13
+ from urllib.parse import urlparse
14
+
15
+ import requests
16
+
17
+ from backend.core import config
18
+
19
+
20
class UrlIngestionError(ValueError):
    """Raised when a URL cannot be fetched safely for ingestion.

    Subclasses ValueError so callers that treat fetch problems as bad input
    can catch either type. NOTE(review): because of this, never raise it
    inside a ``try`` block guarded by ``except ValueError``.
    """
22
+
23
+
24
@dataclass
class FetchedUrlDocument:
    """Result of a successful operator URL fetch, ready for ingestion."""

    source_url: str    # URL as originally requested (after strip/parse)
    final_url: str     # URL after following redirects
    filename: str      # sanitized display filename (always ends in .md)
    temp_path: str     # on-disk temp file holding the rendered Markdown
    content_type: str  # normalized media type of the remote response
    content_bytes: int  # size of the raw downloaded body in bytes
    title: str | None = None  # label override or extracted <title>, if any
33
+
34
+
35
class _VisibleTextParser(HTMLParser):
    """Extract human-visible text (and the <title>) from an HTML document.

    Skips script/style/noscript content, inserts newline markers at
    block-level closing tags, and collapses runs of whitespace.
    """

    def __init__(self) -> None:
        super().__init__()
        self._skip_depth = 0  # nesting depth inside script/style/noscript
        self._parts: list[str] = []  # text fragments plus "\n" block markers
        self._title_parts: list[str] = []  # fragments seen inside <title>
        self._inside_title = False

    def handle_starttag(self, tag: str, attrs) -> None:  # noqa: ANN001
        del attrs
        if tag in {"script", "style", "noscript"}:
            self._skip_depth += 1
        if tag == "title":
            self._inside_title = True

    def handle_endtag(self, tag: str) -> None:
        if tag in {"script", "style", "noscript"} and self._skip_depth > 0:
            self._skip_depth -= 1
        if tag == "title":
            self._inside_title = False
        # Block-level closers become newline markers so paragraphs stay separated.
        if tag in {"p", "div", "section", "article", "li", "br", "h1", "h2", "h3", "h4"}:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth > 0:
            return  # inside script/style/noscript: not visible text
        # Collapse all internal whitespace to single spaces.
        text = " ".join(data.split())
        if not text:
            return
        if self._inside_title:
            self._title_parts.append(text)
        # NOTE(review): title text is also appended to the body parts here,
        # so the <title> appears in .text as well — confirm intended.
        self._parts.append(text)

    @property
    def title(self) -> str | None:
        """Joined <title> text, or None when the document had no title."""
        title = " ".join(self._title_parts).strip()
        return title or None

    @property
    def text(self) -> str:
        """Visible text: single spaces between fragments, single newlines
        between blocks, blank lines removed."""
        chunks: list[str] = []
        for part in self._parts:
            if part == "\n":
                # Collapse consecutive block markers into one newline.
                if not chunks or chunks[-1] == "\n":
                    continue
                chunks.append("\n")
                continue
            if chunks and chunks[-1] not in {"\n", ""}:
                chunks.append(" ")
            chunks.append(part)
        text = "".join(chunks)
        lines = [line.strip() for line in text.splitlines()]
        return "\n".join(line for line in lines if line)
88
+
89
+
90
+ def _normalize_host(hostname: str | None) -> str:
91
+ if not hostname:
92
+ raise UrlIngestionError("URL must include a hostname.")
93
+ return hostname.strip().lower().rstrip(".")
94
+
95
+
96
def _resolve_allowed_hosts() -> list[str]:
    """Return the configured host allowlist, normalized and de-duplicated.

    Insertion order of the configuration is preserved.
    """
    # A dict keyed by normalized host dedupes while keeping first-seen order.
    ordered: dict[str, None] = {}
    for raw in config.URL_INGEST_ALLOWED_HOSTS:
        ordered.setdefault(_normalize_host(raw), None)
    return list(ordered)
106
+
107
+
108
+ def _host_is_allowed(hostname: str, allowed_hosts: Iterable[str]) -> bool:
109
+ allowed = list(allowed_hosts)
110
+ if not allowed:
111
+ return True
112
+ for allowed_host in allowed:
113
+ if hostname == allowed_host or hostname.endswith(f".{allowed_host}"):
114
+ return True
115
+ return False
116
+
117
+
118
def _ensure_safe_host(parsed_url) -> None:  # noqa: ANN001
    """Reject URLs that embed credentials, use odd schemes/ports, miss the
    allowlist, or resolve to private/non-routable addresses (basic SSRF guard).

    Raises UrlIngestionError on any violation; returns None when safe.
    """
    hostname = _normalize_host(parsed_url.hostname)
    # Embedded credentials are a common trick to confuse host parsing.
    if parsed_url.username or parsed_url.password:
        raise UrlIngestionError("Authenticated URLs are not allowed.")
    if parsed_url.scheme not in {"http", "https"}:
        raise UrlIngestionError("Only http and https URLs are supported.")
    if parsed_url.port not in {None, 80, 443}:
        raise UrlIngestionError("Only standard ports 80 and 443 are allowed.")
    if not _host_is_allowed(hostname, _resolve_allowed_hosts()):
        raise UrlIngestionError("Requested host is not in the URL ingestion allowlist.")

    # Resolve every address for the host and block private/reserved ranges.
    # NOTE(review): this lookup is separate from the actual HTTP connection,
    # so a DNS-rebinding attacker could pass here yet resolve differently at
    # fetch time — TODO confirm this residual risk is acceptable.
    try:
        addr_info = socket.getaddrinfo(hostname, parsed_url.port or 443, type=socket.SOCK_STREAM)
    except socket.gaierror as exc:
        raise UrlIngestionError(f"Could not resolve host: {hostname}") from exc

    for _family, _socktype, _proto, _canonname, sockaddr in addr_info:
        raw_ip = sockaddr[0]
        ip = ipaddress.ip_address(raw_ip)
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_reserved
            or ip.is_unspecified
        ):
            raise UrlIngestionError("URL resolves to a blocked private or non-routable address.")
146
+
147
+
148
+ def _sanitize_filename(url: str, title: str | None) -> str:
149
+ parsed = urlparse(url)
150
+ stem = title or Path(parsed.path).stem or parsed.hostname or "url-document"
151
+ cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", " "} else "-" for ch in stem)
152
+ cleaned = "-".join(cleaned.split()).strip("-_")
153
+ cleaned = cleaned[:80] or "url-document"
154
+ return f"{cleaned}.md"
155
+
156
+
157
+ def _render_markdown_from_remote(source_url: str, title: str | None, body: str) -> str:
158
+ parts = []
159
+ if title:
160
+ parts.append(f"# {title}")
161
+ parts.append(f"Source URL: {source_url}")
162
+ parts.append("")
163
+ parts.append(body.strip())
164
+ return "\n".join(parts).strip() + "\n"
165
+
166
+
167
def _extract_text_payload(content_type: str, raw_bytes: bytes) -> tuple[str | None, str]:
    """Decode *raw_bytes* and return ``(title, text)``.

    HTML-ish payloads go through the visible-text parser (title taken from
    <title>); any other text type is returned verbatim after stripping.
    Raises UrlIngestionError when no readable text remains.
    """
    decoded = raw_bytes.decode("utf-8", errors="replace")
    kind = content_type.lower()
    is_html = kind.startswith("text/html") or "application/xhtml+xml" in kind
    if not is_html:
        # Plain text / markdown: no title to extract.
        stripped = decoded.strip()
        if not stripped:
            raise UrlIngestionError("Fetched URL returned empty text.")
        return None, stripped
    parser = _VisibleTextParser()
    parser.feed(decoded)
    extracted = parser.text
    if not extracted.strip():
        raise UrlIngestionError("Fetched HTML did not contain readable text.")
    return parser.title, extracted
181
+
182
+
183
def fetch_url_to_tempfile(url: str, *, label: str | None = None) -> FetchedUrlDocument:
    """Safely fetch *url* and persist its readable text as a Markdown temp file.

    Validates scheme/port/allowlist and blocks private or non-routable
    addresses before connecting, enforces content-type and size limits while
    streaming, then renders the extracted text to a ``.md`` temp file.

    Args:
        url: The http(s) URL to fetch.
        label: Optional display label; overrides the extracted page title.

    Returns:
        A FetchedUrlDocument describing the downloaded content and temp file.

    Raises:
        UrlIngestionError: on any policy violation or unusable payload.
    """
    parsed = urlparse((url or "").strip())
    _ensure_safe_host(parsed)

    with requests.Session() as session:
        response = session.get(
            parsed.geturl(),
            allow_redirects=True,
            timeout=config.URL_INGEST_TIMEOUT_S,
            stream=True,
            headers={"User-Agent": config.URL_INGEST_USER_AGENT},
        )

        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            raise UrlIngestionError(f"URL fetch failed with status {response.status_code}.") from exc

        # Re-validate after redirects. NOTE(review): redirect hops are followed
        # before this check runs, so an allowlisted host could bounce the request
        # through a blocked address mid-chain — consider disabling redirects and
        # validating each hop explicitly.
        final_url = response.url or parsed.geturl()
        _ensure_safe_host(urlparse(final_url))

        content_type = (response.headers.get("content-type") or "").split(";", 1)[0].strip().lower()
        if content_type not in config.URL_INGEST_ALLOWED_CONTENT_TYPES:
            raise UrlIngestionError(f"Unsupported URL content type: {content_type or 'unknown'}.")

        # Fast-fail on a declared oversize body. BUGFIX: UrlIngestionError
        # subclasses ValueError, so the original code raised it inside a
        # ``try/except ValueError: pass`` and silently swallowed the limit
        # violation. Parse the header first; compare outside the try.
        declared_length = response.headers.get("content-length")
        if declared_length:
            try:
                declared = int(declared_length)
            except ValueError:
                declared = None  # malformed header: rely on streamed counting below
            if declared is not None and declared > config.URL_INGEST_MAX_BYTES:
                raise UrlIngestionError("Remote URL response exceeds the configured size limit.")

        # Stream with a hard cap so a lying or absent content-length cannot
        # exhaust memory.
        body = bytearray()
        for chunk in response.iter_content(chunk_size=65536):
            if not chunk:
                continue
            body.extend(chunk)
            if len(body) > config.URL_INGEST_MAX_BYTES:
                raise UrlIngestionError("Remote URL response exceeds the configured size limit.")

    title, text = _extract_text_payload(content_type, bytes(body))
    rendered = _render_markdown_from_remote(final_url, label or title, text)
    # mkstemp gives us a unique path; close the fd and rewrite as text.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".md", prefix="morpheus_url_")
    os.close(tmp_fd)
    with open(tmp_path, "w", encoding="utf-8") as handle:
        handle.write(rendered)

    return FetchedUrlDocument(
        source_url=parsed.geturl(),
        final_url=final_url,
        filename=_sanitize_filename(final_url, label or title),
        temp_path=tmp_path,
        content_type=content_type,
        content_bytes=len(body),
        title=label or title,
    )
docs/operations_playbook.md CHANGED
@@ -21,6 +21,7 @@
21
  ## Graph Workflow
22
  - Use Graph view for summary, search, path, and export
23
  - Code indexing is operator-only and restricted to allowed roots
 
24
  - Keep PDF answer-first flow unchanged while graph features expand
25
 
26
  ## Admin Review Backends
 
21
  ## Graph Workflow
22
  - Use Graph view for summary, search, path, and export
23
  - Code indexing is operator-only and restricted to allowed roots
24
+ - URL ingestion is operator-only and restricted by host allowlist plus public-IP checks
25
  - Keep PDF answer-first flow unchanged while graph features expand
26
 
27
  ## Admin Review Backends
docs/release_checklist.md CHANGED
@@ -22,9 +22,11 @@
22
  - Admin review loads traces and evaluation datasets
23
  - Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
24
  - Graph summary/search/export endpoints return tenant-scoped data
 
25
 
26
  ## Operator Checks
27
  - Confirm reviewed eval rows and active eval rows are non-zero
28
  - Confirm `query_traces` are still recording version metadata
29
  - Confirm cache invalidation by feedback still works
30
  - Confirm graph runs appear after code indexing
 
 
22
  - Admin review loads traces and evaluation datasets
23
  - Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
24
  - Graph summary/search/export endpoints return tenant-scoped data
25
+ - Operator URL ingest accepts only safe public hosts and queues background ingestion
26
 
27
  ## Operator Checks
28
  - Confirm reviewed eval rows and active eval rows are non-zero
29
  - Confirm `query_traces` are still recording version metadata
30
  - Confirm cache invalidation by feedback still works
31
  - Confirm graph runs appear after code indexing
32
+ - Confirm URL ingest respects allowlisted hosts and blocks private-network targets
frontend/index.html CHANGED
@@ -503,6 +503,13 @@
503
  <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
504
  <button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
505
  </div>
 
 
 
 
 
 
 
506
  </div>
507
  </div>
508
  </div>
 
503
  <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
504
  <button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
505
  </div>
506
+ <div class="section-label" style="margin-top:16px;">Operator URL Ingest</div>
507
+ <div class="confirm-zone" id="graphUrlOperatorHelp">URL ingestion is unavailable.</div>
508
+ <input type="text" id="graphUrlInput" placeholder="https://example.com/docs/page" style="margin-top:10px;" />
509
+ <input type="text" id="graphUrlLabel" placeholder="Optional display label…" style="margin-top:8px;" />
510
+ <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
511
+ <button class="btn-primary" onclick="runOperatorUrlIngest()">INGEST URL</button>
512
+ </div>
513
  </div>
514
  </div>
515
  </div>
frontend/js/api.js CHANGED
@@ -212,6 +212,17 @@ async function apiAdminIndexCodeGraph(adminKey, payload) {
212
  });
213
  }
214
 
 
 
 
 
 
 
 
 
 
 
 
215
  // ── Corpus ────────────────────────────────────────────────────────────────────
216
  async function apiLoadFiles() {
217
  return apiFetch('/api/v1/corpus/files');
 
212
  });
213
  }
214
 
215
// Fetch the operator URL-ingest configuration (enabled flag, allowlist, limits).
async function apiAdminGetUrlIngestOptions(adminKey) {
  const path = '/api/v1/admin/graph/url-ingest/options';
  return apiAdminFetch(path, adminKey);
}
218
+
219
// Queue a background ingestion for the given payload ({url, label}).
async function apiAdminIngestUrl(adminKey, payload) {
  const body = JSON.stringify(payload);
  return apiAdminFetch('/api/v1/admin/graph/url-ingest', adminKey, {
    method: 'POST',
    body,
  });
}
225
+
226
  // ── Corpus ────────────────────────────────────────────────────────────────────
227
  async function apiLoadFiles() {
228
  return apiFetch('/api/v1/corpus/files');
frontend/js/graph.js CHANGED
@@ -4,7 +4,7 @@
4
  * - `corpus` mode keeps the original category/document force graph
5
  * - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
6
  * - the right-hand Graph view is a minimal explorer for summary, search, path,
7
- * export, and admin-only code indexing
8
  */
9
 
10
  const GRAPH_KIND_COLORS = {
@@ -477,21 +477,31 @@ async function refreshGraphWorkspace() {
477
  async function refreshGraphOperatorState() {
478
  const card = document.getElementById('graphOperatorCard');
479
  const help = document.getElementById('graphOperatorHelp');
 
480
  if (!card || !help) return;
481
  if (!STATE.adminUnlocked || !STATE.adminKey) {
482
  card.style.display = 'none';
483
  return;
484
  }
485
- const options = await apiAdminGetCodeGraphOptions(STATE.adminKey);
 
 
 
486
  card.style.display = '';
487
  STATE.graphAllowedRoots = options.allowed_roots || [];
488
  STATE.graphIndexDefaultRoot = options.default_root || '';
 
489
  if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
490
  document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
491
  }
492
  help.innerHTML = STATE.graphAllowedRoots.length
493
  ? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
494
  : 'No operator code roots are configured.';
 
 
 
 
 
495
  }
496
 
497
  async function runGraphSearch() {
@@ -560,6 +570,24 @@ async function runOperatorCodeIndex() {
560
  await refreshGraphWorkspace();
561
  }
562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  function downloadGraphExport() {
564
  const snapshot = STATE.graphSnapshot;
565
  if (!snapshot) {
@@ -666,6 +694,7 @@ window.refreshGraphOperatorState = refreshGraphOperatorState;
666
  window.runGraphSearch = runGraphSearch;
667
  window.runGraphPath = runGraphPath;
668
  window.runOperatorCodeIndex = runOperatorCodeIndex;
 
669
  window.downloadGraphExport = downloadGraphExport;
670
  window.selectGraphNode = selectGraphNode;
671
  window.selectGraphNodeByKey = selectGraphNodeByKey;
 
4
  * - `corpus` mode keeps the original category/document force graph
5
  * - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
6
  * - the right-hand Graph view is a minimal explorer for summary, search, path,
7
+ * export, and admin-only code indexing / URL ingestion
8
  */
9
 
10
  const GRAPH_KIND_COLORS = {
 
477
/**
 * Refresh the operator card: load code-graph roots and URL-ingest options
 * in parallel, cache them on STATE, then render the allowlists into the
 * two help panels. Hides the card entirely unless the operator key is
 * unlocked.
 */
async function refreshGraphOperatorState() {
  const card = document.getElementById('graphOperatorCard');
  const help = document.getElementById('graphOperatorHelp');
  const urlHelp = document.getElementById('graphUrlOperatorHelp');
  if (!card || !help) return;
  if (!STATE.adminUnlocked || !STATE.adminKey) {
    card.style.display = 'none';
    return;
  }
  // Both endpoints require the admin key; fetch them concurrently.
  const [options, urlOptions] = await Promise.all([
    apiAdminGetCodeGraphOptions(STATE.adminKey),
    apiAdminGetUrlIngestOptions(STATE.adminKey),
  ]);
  card.style.display = '';
  STATE.graphAllowedRoots = options.allowed_roots || [];
  STATE.graphIndexDefaultRoot = options.default_root || '';
  STATE.graphAllowedHosts = urlOptions.allowed_hosts || [];
  // Pre-fill the index root only when the field is still empty.
  if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
    document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
  }
  help.innerHTML = STATE.graphAllowedRoots.length
    ? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
    : 'No operator code roots are configured.';
  if (urlHelp) {
    urlHelp.innerHTML = STATE.graphAllowedHosts.length
      ? `Allowed hosts:<br>${STATE.graphAllowedHosts.map(host => `<code>${esc(host)}</code>`).join('<br>')}`
      : 'No host allowlist is configured. URL ingestion still blocks private and non-routable hosts.';
  }
}
506
 
507
  async function runGraphSearch() {
 
570
  await refreshGraphWorkspace();
571
  }
572
 
573
// Operator action: validate the form inputs, POST the URL for ingestion,
// and toast the queued result.
async function runOperatorUrlIngest() {
  if (!STATE.adminUnlocked || !STATE.adminKey) {
    toast('Unlock operator tools first.', 'error');
    return;
  }
  const urlField = document.getElementById('graphUrlInput');
  const labelField = document.getElementById('graphUrlLabel');
  const url = urlField?.value?.trim() || '';
  const label = labelField?.value?.trim() || '';
  if (!url) {
    toast('Enter a URL to ingest first.', 'error');
    return;
  }
  const payload = { url, label: label || null };
  const result = await apiAdminIngestUrl(STATE.adminKey, payload);
  toast(`Queued URL ingestion for ${result.final_url || result.source_url}.`, 'success');
}
590
+
591
  function downloadGraphExport() {
592
  const snapshot = STATE.graphSnapshot;
593
  if (!snapshot) {
 
694
  window.runGraphSearch = runGraphSearch;
695
  window.runGraphPath = runGraphPath;
696
  window.runOperatorCodeIndex = runOperatorCodeIndex;
697
+ window.runOperatorUrlIngest = runOperatorUrlIngest;
698
  window.downloadGraphExport = downloadGraphExport;
699
  window.selectGraphNode = selectGraphNode;
700
  window.selectGraphNodeByKey = selectGraphNodeByKey;
frontend/js/state.js CHANGED
@@ -36,6 +36,7 @@ const STATE = {
36
  graphSourceKind: '',
37
  graphAllowedRoots: [],
38
  graphIndexDefaultRoot: '',
 
39
  };
40
 
41
  function stateRefreshCategories() {
 
36
  graphSourceKind: '',
37
  graphAllowedRoots: [],
38
  graphIndexDefaultRoot: '',
39
+ graphAllowedHosts: [],
40
  };
41
 
42
  function stateRefreshCategories() {
tests/test_url_ingestion.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import shutil
3
+ from types import SimpleNamespace
4
+
5
+ import pytest
6
+
7
+ from backend.api import admin as admin_api
8
+ from backend.core import pipeline
9
+ from backend.core import url_ingestion
10
+
11
+
12
+ class _FakeTask:
13
+ id = "task-123"
14
+
15
+
16
+ class _FakeResponse:
17
+ def __init__(self, *, url: str, content_type: str, body: bytes, headers=None, status_code: int = 200):
18
+ self.url = url
19
+ self._body = body
20
+ self.headers = {"content-type": content_type, **(headers or {})}
21
+ self.status_code = status_code
22
+
23
+ def raise_for_status(self):
24
+ if self.status_code >= 400:
25
+ raise url_ingestion.requests.HTTPError(f"HTTP {self.status_code}")
26
+
27
+ def iter_content(self, chunk_size=65536):
28
+ for start in range(0, len(self._body), chunk_size):
29
+ yield self._body[start:start + chunk_size]
30
+
31
+
32
+ class _FakeSession:
33
+ def __init__(self, response):
34
+ self._response = response
35
+
36
+ def __enter__(self):
37
+ return self
38
+
39
+ def __exit__(self, exc_type, exc, tb):
40
+ return False
41
+
42
+ def get(self, *args, **kwargs):
43
+ del args, kwargs
44
+ return self._response
45
+
46
+
47
+ def test_fetch_url_to_tempfile_blocks_private_ip(monkeypatch):
48
+ monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", [])
49
+ monkeypatch.setattr(
50
+ url_ingestion.socket,
51
+ "getaddrinfo",
52
+ lambda *args, **kwargs: [(None, None, None, None, ("127.0.0.1", 443))],
53
+ )
54
+
55
+ with pytest.raises(url_ingestion.UrlIngestionError):
56
+ url_ingestion.fetch_url_to_tempfile("https://example.com/internal")
57
+
58
+
59
+ def test_fetch_url_to_tempfile_extracts_readable_html(monkeypatch):
60
+ body = b"<html><head><title>Morpheus Docs</title></head><body><main><h1>Overview</h1><p>Hybrid retrieval works.</p></main></body></html>"
61
+ monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", ["docs.example.com"])
62
+ monkeypatch.setattr(
63
+ url_ingestion.socket,
64
+ "getaddrinfo",
65
+ lambda *args, **kwargs: [(None, None, None, None, ("93.184.216.34", 443))],
66
+ )
67
+ monkeypatch.setattr(
68
+ url_ingestion.requests,
69
+ "Session",
70
+ lambda: _FakeSession(
71
+ _FakeResponse(
72
+ url="https://docs.example.com/guide",
73
+ content_type="text/html; charset=utf-8",
74
+ body=body,
75
+ )
76
+ ),
77
+ )
78
+
79
+ result = url_ingestion.fetch_url_to_tempfile("https://docs.example.com/guide")
80
+
81
+ assert result.final_url == "https://docs.example.com/guide"
82
+ assert result.content_type == "text/html"
83
+ with open(result.temp_path, "r", encoding="utf-8") as handle:
84
+ rendered = handle.read()
85
+ assert "# Morpheus Docs" in rendered
86
+ assert "Hybrid retrieval works." in rendered
87
+
88
+
89
+ def test_run_ingestion_with_url_override_persists_url_metadata(monkeypatch):
90
+ from tests.test_pipeline_regressions import FakeIngestionSupabase
91
+
92
+ fake_supabase = FakeIngestionSupabase()
93
+ captured = {}
94
+ root = Path("tests") / "_tmp_graph_hybrid" / "url_ingestion"
95
+ root.mkdir(parents=True, exist_ok=True)
96
+ try:
97
+ source_path = root / "remote.md"
98
+ source_path.write_text("# Remote Doc\n\nThis URL import contains enough content to ingest.\n", encoding="utf-8")
99
+
100
+ monkeypatch.setattr("backend.core.auth_utils.extract_jwt_sub", lambda token: "user-1")
101
+ monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda path: "url-hash")
102
+ monkeypatch.setattr(pipeline, "is_file_already_ingested", lambda file_hash, access_token=None: False)
103
+ monkeypatch.setattr(pipeline, "_recover_or_prepare_orphaned_upload", lambda *args, **kwargs: None)
104
+ monkeypatch.setattr(pipeline, "_build_supabase_client", lambda access_token=None: fake_supabase)
105
+ monkeypatch.setattr(pipeline, "_build_service_supabase_client", lambda: fake_supabase)
106
+ monkeypatch.setattr(
107
+ pipeline,
108
+ "extract_document_entities",
109
+ lambda *args, **kwargs: SimpleNamespace(is_allowed=True, document_type="general_document"),
110
+ )
111
+ monkeypatch.setattr(pipeline, "_build_document_tree", lambda elements: {"title": "root", "children": []})
112
+ monkeypatch.setattr(
113
+ pipeline,
114
+ "process_chunks",
115
+ lambda chunks, elements, path_for_naming, file_hash, graph_data, user_id, pdf_images, **kwargs: (
116
+ captured.setdefault(
117
+ "docs",
118
+ [SimpleNamespace(metadata={**kwargs, "source": "Remote Doc", "title": "Remote Doc"})],
119
+ ),
120
+ ["doc-1"],
121
+ ),
122
+ )
123
+ monkeypatch.setattr(pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids))
124
+ monkeypatch.setattr(pipeline, "_persist_graph_foundation", lambda **kwargs: None)
125
+ monkeypatch.setattr(pipeline, "upload_to_supabase", lambda docs, ids, access_token=None: None)
126
+ monkeypatch.setattr(pipeline, "_identity_json_from_elements", lambda elements, fallback_title=None: {})
127
+ monkeypatch.setattr(pipeline, "_extract_pdf_title", lambda elements, filename: "Remote Doc")
128
+ monkeypatch.setattr(pipeline, "_log_ingestion_retry_event", lambda **kwargs: None)
129
+
130
+ result = pipeline.run_ingestion(
131
+ file_path=str(source_path),
132
+ original_filename="remote-doc.md",
133
+ access_token="token",
134
+ source_kind_override="url",
135
+ data_shape_override="hybrid",
136
+ parser_kind_override="url_fetch",
137
+ )
138
+
139
+ assert result["filename"] == "Remote Doc"
140
+ assert result["file_hash"] == "url-hash"
141
+ assert captured["docs"][0].metadata["source_kind"] == "url"
142
+ assert captured["docs"][0].metadata["data_shape"] == "hybrid"
143
+ assert captured["docs"][0].metadata["parser_kind"] == "url_fetch"
144
+ ingested_upsert = next(item for item in fake_supabase.upserts if item[0] == "ingested_files")
145
+ assert ingested_upsert[1]["source_kind"] == "url"
146
+ assert ingested_upsert[1]["parser_kind"] == "url_fetch"
147
+ finally:
148
+ shutil.rmtree(root, ignore_errors=True)
149
+
150
+
151
+ def test_run_url_ingest_requires_admin_and_queues_task(monkeypatch):
152
+ monkeypatch.setattr(admin_api, "_check_admin", lambda key: None)
153
+ monkeypatch.setattr(admin_api.config, "URL_INGEST_ADMIN_ENABLED", True, raising=False)
154
+ monkeypatch.setattr(
155
+ admin_api,
156
+ "fetch_url_to_tempfile",
157
+ lambda url, label=None: SimpleNamespace(
158
+ source_url=url,
159
+ final_url=url,
160
+ filename="Morpheus-Docs.md",
161
+ temp_path="C:/tmp/morpheus-url.md",
162
+ content_type="text/html",
163
+ content_bytes=1234,
164
+ title=label or "Morpheus Docs",
165
+ ),
166
+ )
167
+ monkeypatch.setattr(admin_api.process_document_task, "delay", lambda *args: _FakeTask())
168
+
169
+ result = admin_api.run_url_ingest(
170
+ admin_api.UrlIngestPayload(url="https://docs.example.com/guide", label="Morpheus Docs"),
171
+ x_admin_key="admin",
172
+ x_auth_token="token",
173
+ user_id="user-1",
174
+ )
175
+
176
+ assert result["ok"] is True
177
+ assert result["task_id"] == "task-123"
178
+ assert result["filename"] == "Morpheus-Docs.md"