nothex committed on
Commit
0f5e813
·
1 Parent(s): 96732d9

feat: add safe operator URL ingestion

Browse files
.env.example CHANGED
@@ -13,6 +13,10 @@ NVIDIA_API_BASE_URL=https://integrate.api.nvidia.com/v1
13
  # Admin review backend
14
  ADMIN_REVIEW_PROVIDER=auto
15
  ADMIN_REVIEW_MODEL=gemma4:latest
 
 
 
 
16
 
17
  # Supabase
18
  SUPABASE_URL=https://example.supabase.co
 
13
  # Admin review backend
14
  ADMIN_REVIEW_PROVIDER=auto
15
  ADMIN_REVIEW_MODEL=gemma4:latest
16
+ URL_INGEST_ADMIN_ENABLED=true
17
+ URL_INGEST_ALLOWED_HOSTS=
18
+ URL_INGEST_TIMEOUT_S=12
19
+ URL_INGEST_MAX_BYTES=1500000
20
 
21
  # Supabase
22
  SUPABASE_URL=https://example.supabase.co
.gitignore CHANGED
@@ -17,6 +17,7 @@ note_to_me.txt
17
 
18
  .dual-graph/
19
  tests/_tmp_graph_hybrid/
 
20
  tests/_tmp_pytest*/
21
  tests/_tmp_intent_monitor/
22
  tests/_tmp_intent_rollback/
 
17
 
18
  .dual-graph/
19
  tests/_tmp_graph_hybrid/
20
+ tests/_tmp_url_ingestion/
21
  tests/_tmp_pytest*/
22
  tests/_tmp_intent_monitor/
23
  tests/_tmp_intent_rollback/
backend/api/admin.py CHANGED
@@ -12,6 +12,8 @@ from pydantic import BaseModel
12
  from backend.core.auth_utils import require_auth_token
13
  from backend.core import config
14
  from backend.core.code_graph import index_python_codebase
 
 
15
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
16
  from backend.core.pipeline import _build_service_supabase_client
17
 
@@ -67,6 +69,11 @@ class CodeGraphIndexPayload(BaseModel):
67
  label: Optional[str] = None
68
 
69
 
 
 
 
 
 
70
  def _admin_client():
71
  return _build_service_supabase_client()
72
 
@@ -110,6 +117,11 @@ def _resolve_code_graph_root(root_path: str) -> Path:
110
  )
111
 
112
 
 
 
 
 
 
113
  def _trace_sort_key(row: dict):
114
  return row.get("created_at") or ""
115
 
@@ -923,3 +935,65 @@ def run_code_graph_index(
923
  "root_path": str(root),
924
  "result": result,
925
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from backend.core.auth_utils import require_auth_token
13
  from backend.core import config
14
  from backend.core.code_graph import index_python_codebase
15
+ from backend.core.tasks import process_document_task
16
+ from backend.core.url_ingestion import UrlIngestionError, fetch_url_to_tempfile
17
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
18
  from backend.core.pipeline import _build_service_supabase_client
19
 
 
69
  label: Optional[str] = None
70
 
71
 
72
class UrlIngestPayload(BaseModel):
    """Request body for the operator URL-ingest endpoints."""

    # Absolute http(s) URL to fetch; validated server-side by url_ingestion.
    url: str
    # Optional display label; when set it overrides the extracted page title.
    label: Optional[str] = None
75
+
76
+
77
  def _admin_client():
78
  return _build_service_supabase_client()
79
 
 
117
  )
118
 
119
 
120
def _ensure_url_ingest_enabled() -> None:
    """Abort with HTTP 403 when the operator URL-ingest feature flag is off."""
    if not config.URL_INGEST_ADMIN_ENABLED:
        raise HTTPException(status_code=403, detail="Operator URL ingestion is disabled.")
123
+
124
+
125
  def _trace_sort_key(row: dict):
126
  return row.get("created_at") or ""
127
 
 
935
  "root_path": str(root),
936
  "result": result,
937
  }
938
+
939
+
940
+ @router.get("/graph/url-ingest/options")
941
+ def get_url_ingest_options(
942
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
943
+ user_id: str = Depends(require_auth_token),
944
+ ):
945
+ _check_admin(x_admin_key)
946
+ return {
947
+ "ok": True,
948
+ "enabled": bool(config.URL_INGEST_ADMIN_ENABLED),
949
+ "allowed_hosts": list(config.URL_INGEST_ALLOWED_HOSTS),
950
+ "timeout_s": config.URL_INGEST_TIMEOUT_S,
951
+ "max_bytes": config.URL_INGEST_MAX_BYTES,
952
+ "user_id": user_id,
953
+ }
954
+
955
+
956
+ @router.post("/graph/url-ingest")
957
+ def run_url_ingest(
958
+ payload: UrlIngestPayload,
959
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
960
+ x_auth_token: str = Header(..., alias="X-Auth-Token"),
961
+ user_id: str = Depends(require_auth_token),
962
+ ):
963
+ del user_id
964
+ _check_admin(x_admin_key)
965
+ _ensure_url_ingest_enabled()
966
+ try:
967
+ fetched = fetch_url_to_tempfile(payload.url, label=payload.label)
968
+ except UrlIngestionError as exc:
969
+ raise HTTPException(status_code=400, detail=str(exc)) from exc
970
+ except Exception as exc:
971
+ log.error("Operator URL ingestion failed for %s: %s", payload.url, exc)
972
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
973
+
974
+ try:
975
+ task = process_document_task.delay(
976
+ fetched.temp_path,
977
+ fetched.filename,
978
+ x_auth_token,
979
+ "url",
980
+ "hybrid",
981
+ "url_fetch",
982
+ )
983
+ except Exception:
984
+ try:
985
+ os.unlink(fetched.temp_path)
986
+ except OSError:
987
+ log.warning("Could not remove fetched URL temp file %s", fetched.temp_path)
988
+ raise
989
+
990
+ return {
991
+ "ok": True,
992
+ "task_id": task.id,
993
+ "source_url": fetched.source_url,
994
+ "final_url": fetched.final_url,
995
+ "filename": fetched.filename,
996
+ "content_type": fetched.content_type,
997
+ "content_bytes": fetched.content_bytes,
998
+ "title": fetched.title,
999
+ }
backend/core/config.py CHANGED
@@ -279,6 +279,12 @@ CODE_GRAPH_ADMIN_ENABLED = os.getenv("CODE_GRAPH_ADMIN_ENABLED", "true").strip()
279
  "yes",
280
  "on",
281
  }
 
 
 
 
 
 
282
  CODE_GRAPH_ALLOWED_ROOTS = [
283
  candidate
284
  for candidate in [
@@ -291,6 +297,22 @@ CODE_GRAPH_ALLOWED_ROOTS = [
291
  ]
292
  if candidate
293
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
 
296
  def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
 
279
  "yes",
280
  "on",
281
  }
282
+ URL_INGEST_ADMIN_ENABLED = os.getenv("URL_INGEST_ADMIN_ENABLED", "true").strip().lower() in {
283
+ "1",
284
+ "true",
285
+ "yes",
286
+ "on",
287
+ }
288
  CODE_GRAPH_ALLOWED_ROOTS = [
289
  candidate
290
  for candidate in [
 
297
  ]
298
  if candidate
299
  ]
300
+ URL_INGEST_ALLOWED_HOSTS = [
301
+ segment.strip().lower()
302
+ for segment in os.getenv("URL_INGEST_ALLOWED_HOSTS", "").split(",")
303
+ if segment.strip()
304
+ ]
305
+ URL_INGEST_ALLOWED_CONTENT_TYPES = {
306
+ "text/html",
307
+ "application/xhtml+xml",
308
+ "text/plain",
309
+ "text/markdown",
310
+ }
311
+ URL_INGEST_TIMEOUT_S = float(os.getenv("URL_INGEST_TIMEOUT_S", "12"))
312
+ URL_INGEST_MAX_BYTES = int(os.getenv("URL_INGEST_MAX_BYTES", "1500000"))
313
+ URL_INGEST_USER_AGENT = os.getenv(
314
+ "URL_INGEST_USER_AGENT", "MorpheusBot/1.0 (+https://nothex-morpheus-rag.hf.space)"
315
+ ).strip()
316
 
317
 
318
  def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
backend/core/pipeline_ingestion.py CHANGED
@@ -406,6 +406,9 @@ def run_ingestion(
406
  original_filename: str = None,
407
  access_token: str = None,
408
  pdf_path: Optional[str] = None,
 
 
 
409
  ) -> str:
410
  """
411
  Ingestion orchestrator.
@@ -447,6 +450,12 @@ def run_ingestion(
447
  resolved_path,
448
  original_filename=original_filename,
449
  )
 
 
 
 
 
 
450
  log.info("=" * 50)
451
  log.info("Starting ingestion: %s", resolved_path)
452
 
@@ -502,12 +511,12 @@ def run_ingestion(
502
  access_token=access_token,
503
  )
504
 
505
- if source_kind == "markdown":
506
- _progress(2, "Parsing Markdown structure…")
507
  else:
508
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
509
  stage_started = time.perf_counter()
510
- if source_kind == "markdown":
511
  elements = partition_markdown(resolved_path)
512
  pdf_images = {}
513
  else:
@@ -517,6 +526,8 @@ def run_ingestion(
517
  if not elements:
518
  if source_kind == "markdown":
519
  raise ValueError("Markdown file appears empty or unreadable.")
 
 
520
  raise ValueError(
521
  "The PDF appears blank or unreadable. "
522
  "If scanned, ensure tesseract-ocr is installed."
@@ -528,6 +539,10 @@ def run_ingestion(
528
  raise ValueError(
529
  f"Markdown file contains almost no readable text ({text_chars} chars)."
530
  )
 
 
 
 
531
  raise ValueError(
532
  f"PDF contains almost no readable text ({text_chars} chars). "
533
  "May be corrupted or image-only without OCR layer."
@@ -567,7 +582,7 @@ def run_ingestion(
567
 
568
  _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
569
  stage_started = time.perf_counter()
570
- if source_kind == "markdown":
571
  chunks = _create_markdown_chunks(elements)
572
  else:
573
  chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
 
406
  original_filename: str = None,
407
  access_token: str = None,
408
  pdf_path: Optional[str] = None,
409
+ source_kind_override: Optional[str] = None,
410
+ data_shape_override: Optional[str] = None,
411
+ parser_kind_override: Optional[str] = None,
412
  ) -> str:
413
  """
414
  Ingestion orchestrator.
 
450
  resolved_path,
451
  original_filename=original_filename,
452
  )
453
+ if source_kind_override:
454
+ source_kind = str(source_kind_override).strip().lower()
455
+ if data_shape_override:
456
+ data_shape = str(data_shape_override).strip().lower()
457
+ if parser_kind_override:
458
+ parser_kind = str(parser_kind_override).strip().lower()
459
  log.info("=" * 50)
460
  log.info("Starting ingestion: %s", resolved_path)
461
 
 
511
  access_token=access_token,
512
  )
513
 
514
+ if source_kind in {"markdown", "url"}:
515
+ _progress(2, "Parsing document structure…")
516
  else:
517
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
518
  stage_started = time.perf_counter()
519
+ if source_kind in {"markdown", "url"}:
520
  elements = partition_markdown(resolved_path)
521
  pdf_images = {}
522
  else:
 
526
  if not elements:
527
  if source_kind == "markdown":
528
  raise ValueError("Markdown file appears empty or unreadable.")
529
+ if source_kind == "url":
530
+ raise ValueError("Fetched URL content appears empty or unreadable.")
531
  raise ValueError(
532
  "The PDF appears blank or unreadable. "
533
  "If scanned, ensure tesseract-ocr is installed."
 
539
  raise ValueError(
540
  f"Markdown file contains almost no readable text ({text_chars} chars)."
541
  )
542
+ if source_kind == "url":
543
+ raise ValueError(
544
+ f"Fetched URL content contains almost no readable text ({text_chars} chars)."
545
+ )
546
  raise ValueError(
547
  f"PDF contains almost no readable text ({text_chars} chars). "
548
  "May be corrupted or image-only without OCR layer."
 
582
 
583
  _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
584
  stage_started = time.perf_counter()
585
+ if source_kind in {"markdown", "url"}:
586
  chunks = _create_markdown_chunks(elements)
587
  else:
588
  chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
backend/core/tasks.py CHANGED
@@ -54,7 +54,15 @@ def _cleanup_temp_upload(tmp_path: str) -> None:
54
  log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
55
 
56
 
57
- def _process_document_task_impl(self, tmp_path: str, original_filename: str, access_token: str):
 
 
 
 
 
 
 
 
58
  """
59
  This runs in a completely separate background process!
60
  We pass a progress_callback to run_ingestion so it can report its status.
@@ -72,6 +80,9 @@ def _process_document_task_impl(self, tmp_path: str, original_filename: str, acc
72
  original_filename=original_filename,
73
  progress_callback=update_progress,
74
  access_token=access_token,
 
 
 
75
  )
76
  finally:
77
  _cleanup_temp_upload(tmp_path)
 
54
  log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
55
 
56
 
57
+ def _process_document_task_impl(
58
+ self,
59
+ tmp_path: str,
60
+ original_filename: str,
61
+ access_token: str,
62
+ source_kind_override: str | None = None,
63
+ data_shape_override: str | None = None,
64
+ parser_kind_override: str | None = None,
65
+ ):
66
  """
67
  This runs in a completely separate background process!
68
  We pass a progress_callback to run_ingestion so it can report its status.
 
80
  original_filename=original_filename,
81
  progress_callback=update_progress,
82
  access_token=access_token,
83
+ source_kind_override=source_kind_override,
84
+ data_shape_override=data_shape_override,
85
+ parser_kind_override=parser_kind_override,
86
  )
87
  finally:
88
  _cleanup_temp_upload(tmp_path)
backend/core/url_ingestion.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Operator-safe URL ingestion helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from html.parser import HTMLParser
7
+ import ipaddress
8
+ import os
9
+ from pathlib import Path
10
+ import socket
11
+ import tempfile
12
+ from typing import Iterable
13
+ from urllib.parse import urlparse
14
+
15
+ import requests
16
+
17
+ from backend.core import config
18
+
19
+
20
class UrlIngestionError(ValueError):
    """Raised when a URL cannot be fetched safely for ingestion.

    Subclasses ValueError so callers that treat fetch problems as bad input
    can catch either type. NOTE(review): because of this, never raise it
    inside a ``try`` block guarded by ``except ValueError``.
    """
22
+
23
+
24
@dataclass
class FetchedUrlDocument:
    """Result of a successful operator URL fetch, ready for ingestion."""

    source_url: str    # URL as originally requested (after strip/parse)
    final_url: str     # URL after following redirects
    filename: str      # sanitized display filename (always ends in .md)
    temp_path: str     # on-disk temp file holding the rendered Markdown
    content_type: str  # normalized media type of the remote response
    content_bytes: int  # size of the raw downloaded body in bytes
    title: str | None = None  # label override or extracted <title>, if any
33
+
34
+
35
class _VisibleTextParser(HTMLParser):
    """Extract human-visible text (and the <title>) from an HTML document.

    Skips script/style/noscript content, inserts newline markers at
    block-level closing tags, and collapses runs of whitespace.
    """

    def __init__(self) -> None:
        super().__init__()
        self._skip_depth = 0  # nesting depth inside script/style/noscript
        self._parts: list[str] = []  # text fragments plus "\n" block markers
        self._title_parts: list[str] = []  # fragments seen inside <title>
        self._inside_title = False

    def handle_starttag(self, tag: str, attrs) -> None:  # noqa: ANN001
        del attrs
        if tag in {"script", "style", "noscript"}:
            self._skip_depth += 1
        if tag == "title":
            self._inside_title = True

    def handle_endtag(self, tag: str) -> None:
        if tag in {"script", "style", "noscript"} and self._skip_depth > 0:
            self._skip_depth -= 1
        if tag == "title":
            self._inside_title = False
        # Block-level closers become newline markers so paragraphs stay separated.
        if tag in {"p", "div", "section", "article", "li", "br", "h1", "h2", "h3", "h4"}:
            self._parts.append("\n")

    def handle_data(self, data: str) -> None:
        if self._skip_depth > 0:
            return  # inside script/style/noscript: not visible text
        # Collapse all internal whitespace to single spaces.
        text = " ".join(data.split())
        if not text:
            return
        if self._inside_title:
            self._title_parts.append(text)
        # NOTE(review): title text is also appended to the body parts here,
        # so the <title> appears in .text as well — confirm intended.
        self._parts.append(text)

    @property
    def title(self) -> str | None:
        """Joined <title> text, or None when the document had no title."""
        title = " ".join(self._title_parts).strip()
        return title or None

    @property
    def text(self) -> str:
        """Visible text: single spaces between fragments, single newlines
        between blocks, blank lines removed."""
        chunks: list[str] = []
        for part in self._parts:
            if part == "\n":
                # Collapse consecutive block markers into one newline.
                if not chunks or chunks[-1] == "\n":
                    continue
                chunks.append("\n")
                continue
            if chunks and chunks[-1] not in {"\n", ""}:
                chunks.append(" ")
            chunks.append(part)
        text = "".join(chunks)
        lines = [line.strip() for line in text.splitlines()]
        return "\n".join(line for line in lines if line)
88
+
89
+
90
+ def _normalize_host(hostname: str | None) -> str:
91
+ if not hostname:
92
+ raise UrlIngestionError("URL must include a hostname.")
93
+ return hostname.strip().lower().rstrip(".")
94
+
95
+
96
def _resolve_allowed_hosts() -> list[str]:
    """Return the configured host allowlist, normalized and de-duplicated.

    Insertion order of the configuration is preserved.
    """
    # A dict keyed by normalized host dedupes while keeping first-seen order.
    ordered: dict[str, None] = {}
    for raw in config.URL_INGEST_ALLOWED_HOSTS:
        ordered.setdefault(_normalize_host(raw), None)
    return list(ordered)
106
+
107
+
108
+ def _host_is_allowed(hostname: str, allowed_hosts: Iterable[str]) -> bool:
109
+ allowed = list(allowed_hosts)
110
+ if not allowed:
111
+ return True
112
+ for allowed_host in allowed:
113
+ if hostname == allowed_host or hostname.endswith(f".{allowed_host}"):
114
+ return True
115
+ return False
116
+
117
+
118
def _ensure_safe_host(parsed_url) -> None:  # noqa: ANN001
    """Reject URLs that embed credentials, use odd schemes/ports, miss the
    allowlist, or resolve to private/non-routable addresses (basic SSRF guard).

    Raises UrlIngestionError on any violation; returns None when safe.
    """
    hostname = _normalize_host(parsed_url.hostname)
    # Embedded credentials are a common trick to confuse host parsing.
    if parsed_url.username or parsed_url.password:
        raise UrlIngestionError("Authenticated URLs are not allowed.")
    if parsed_url.scheme not in {"http", "https"}:
        raise UrlIngestionError("Only http and https URLs are supported.")
    if parsed_url.port not in {None, 80, 443}:
        raise UrlIngestionError("Only standard ports 80 and 443 are allowed.")
    if not _host_is_allowed(hostname, _resolve_allowed_hosts()):
        raise UrlIngestionError("Requested host is not in the URL ingestion allowlist.")

    # Resolve every address for the host and block private/reserved ranges.
    # NOTE(review): this lookup is separate from the actual HTTP connection,
    # so a DNS-rebinding attacker could pass here yet resolve differently at
    # fetch time — TODO confirm this residual risk is acceptable.
    try:
        addr_info = socket.getaddrinfo(hostname, parsed_url.port or 443, type=socket.SOCK_STREAM)
    except socket.gaierror as exc:
        raise UrlIngestionError(f"Could not resolve host: {hostname}") from exc

    for _family, _socktype, _proto, _canonname, sockaddr in addr_info:
        raw_ip = sockaddr[0]
        ip = ipaddress.ip_address(raw_ip)
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_reserved
            or ip.is_unspecified
        ):
            raise UrlIngestionError("URL resolves to a blocked private or non-routable address.")
146
+
147
+
148
+ def _sanitize_filename(url: str, title: str | None) -> str:
149
+ parsed = urlparse(url)
150
+ stem = title or Path(parsed.path).stem or parsed.hostname or "url-document"
151
+ cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", " "} else "-" for ch in stem)
152
+ cleaned = "-".join(cleaned.split()).strip("-_")
153
+ cleaned = cleaned[:80] or "url-document"
154
+ return f"{cleaned}.md"
155
+
156
+
157
+ def _render_markdown_from_remote(source_url: str, title: str | None, body: str) -> str:
158
+ parts = []
159
+ if title:
160
+ parts.append(f"# {title}")
161
+ parts.append(f"Source URL: {source_url}")
162
+ parts.append("")
163
+ parts.append(body.strip())
164
+ return "\n".join(parts).strip() + "\n"
165
+
166
+
167
def _extract_text_payload(content_type: str, raw_bytes: bytes) -> tuple[str | None, str]:
    """Decode *raw_bytes* and return ``(title, text)``.

    HTML-ish payloads go through the visible-text parser (title taken from
    <title>); any other text type is returned verbatim after stripping.
    Raises UrlIngestionError when no readable text remains.
    """
    decoded = raw_bytes.decode("utf-8", errors="replace")
    kind = content_type.lower()
    is_html = kind.startswith("text/html") or "application/xhtml+xml" in kind
    if not is_html:
        # Plain text / markdown: no title to extract.
        stripped = decoded.strip()
        if not stripped:
            raise UrlIngestionError("Fetched URL returned empty text.")
        return None, stripped
    parser = _VisibleTextParser()
    parser.feed(decoded)
    extracted = parser.text
    if not extracted.strip():
        raise UrlIngestionError("Fetched HTML did not contain readable text.")
    return parser.title, extracted
181
+
182
+
183
def fetch_url_to_tempfile(url: str, *, label: str | None = None) -> FetchedUrlDocument:
    """Safely fetch *url* and persist its readable text as a Markdown temp file.

    Validates scheme/port/allowlist and blocks private or non-routable
    addresses before connecting, enforces content-type and size limits while
    streaming, then renders the extracted text to a ``.md`` temp file.

    Args:
        url: The http(s) URL to fetch.
        label: Optional display label; overrides the extracted page title.

    Returns:
        A FetchedUrlDocument describing the downloaded content and temp file.

    Raises:
        UrlIngestionError: on any policy violation or unusable payload.
    """
    parsed = urlparse((url or "").strip())
    _ensure_safe_host(parsed)

    with requests.Session() as session:
        response = session.get(
            parsed.geturl(),
            allow_redirects=True,
            timeout=config.URL_INGEST_TIMEOUT_S,
            stream=True,
            headers={"User-Agent": config.URL_INGEST_USER_AGENT},
        )

        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            raise UrlIngestionError(f"URL fetch failed with status {response.status_code}.") from exc

        # Re-validate after redirects. NOTE(review): redirect hops are followed
        # before this check runs, so an allowlisted host could bounce the request
        # through a blocked address mid-chain — consider disabling redirects and
        # validating each hop explicitly.
        final_url = response.url or parsed.geturl()
        _ensure_safe_host(urlparse(final_url))

        content_type = (response.headers.get("content-type") or "").split(";", 1)[0].strip().lower()
        if content_type not in config.URL_INGEST_ALLOWED_CONTENT_TYPES:
            raise UrlIngestionError(f"Unsupported URL content type: {content_type or 'unknown'}.")

        # Fast-fail on a declared oversize body. BUGFIX: UrlIngestionError
        # subclasses ValueError, so the original code raised it inside a
        # ``try/except ValueError: pass`` and silently swallowed the limit
        # violation. Parse the header first; compare outside the try.
        declared_length = response.headers.get("content-length")
        if declared_length:
            try:
                declared = int(declared_length)
            except ValueError:
                declared = None  # malformed header: rely on streamed counting below
            if declared is not None and declared > config.URL_INGEST_MAX_BYTES:
                raise UrlIngestionError("Remote URL response exceeds the configured size limit.")

        # Stream with a hard cap so a lying or absent content-length cannot
        # exhaust memory.
        body = bytearray()
        for chunk in response.iter_content(chunk_size=65536):
            if not chunk:
                continue
            body.extend(chunk)
            if len(body) > config.URL_INGEST_MAX_BYTES:
                raise UrlIngestionError("Remote URL response exceeds the configured size limit.")

    title, text = _extract_text_payload(content_type, bytes(body))
    rendered = _render_markdown_from_remote(final_url, label or title, text)
    # mkstemp gives us a unique path; close the fd and rewrite as text.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".md", prefix="morpheus_url_")
    os.close(tmp_fd)
    with open(tmp_path, "w", encoding="utf-8") as handle:
        handle.write(rendered)

    return FetchedUrlDocument(
        source_url=parsed.geturl(),
        final_url=final_url,
        filename=_sanitize_filename(final_url, label or title),
        temp_path=tmp_path,
        content_type=content_type,
        content_bytes=len(body),
        title=label or title,
    )
docs/operations_playbook.md CHANGED
@@ -21,6 +21,7 @@
21
  ## Graph Workflow
22
  - Use Graph view for summary, search, path, and export
23
  - Code indexing is operator-only and restricted to allowed roots
 
24
  - Keep PDF answer-first flow unchanged while graph features expand
25
 
26
  ## Admin Review Backends
 
21
  ## Graph Workflow
22
  - Use Graph view for summary, search, path, and export
23
  - Code indexing is operator-only and restricted to allowed roots
24
+ - URL ingestion is operator-only and restricted by host allowlist plus public-IP checks
25
  - Keep PDF answer-first flow unchanged while graph features expand
26
 
27
  ## Admin Review Backends
docs/release_checklist.md CHANGED
@@ -22,9 +22,11 @@
22
  - Admin review loads traces and evaluation datasets
23
  - Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
24
  - Graph summary/search/export endpoints return tenant-scoped data
 
25
 
26
  ## Operator Checks
27
  - Confirm reviewed eval rows and active eval rows are non-zero
28
  - Confirm `query_traces` are still recording version metadata
29
  - Confirm cache invalidation by feedback still works
30
  - Confirm graph runs appear after code indexing
 
 
22
  - Admin review loads traces and evaluation datasets
23
  - Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
24
  - Graph summary/search/export endpoints return tenant-scoped data
25
+ - Operator URL ingest accepts only safe public hosts and queues background ingestion
26
 
27
  ## Operator Checks
28
  - Confirm reviewed eval rows and active eval rows are non-zero
29
  - Confirm `query_traces` are still recording version metadata
30
  - Confirm cache invalidation by feedback still works
31
  - Confirm graph runs appear after code indexing
32
+ - Confirm URL ingest respects allowlisted hosts and blocks private-network targets
frontend/index.html CHANGED
@@ -503,6 +503,13 @@
503
  <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
504
  <button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
505
  </div>
 
 
 
 
 
 
 
506
  </div>
507
  </div>
508
  </div>
 
503
  <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
504
  <button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
505
  </div>
506
+ <div class="section-label" style="margin-top:16px;">Operator URL Ingest</div>
507
+ <div class="confirm-zone" id="graphUrlOperatorHelp">URL ingestion is unavailable.</div>
508
+ <input type="text" id="graphUrlInput" placeholder="https://example.com/docs/page" style="margin-top:10px;" />
509
+ <input type="text" id="graphUrlLabel" placeholder="Optional display label…" style="margin-top:8px;" />
510
+ <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
511
+ <button class="btn-primary" onclick="runOperatorUrlIngest()">INGEST URL</button>
512
+ </div>
513
  </div>
514
  </div>
515
  </div>
frontend/js/api.js CHANGED
@@ -212,6 +212,17 @@ async function apiAdminIndexCodeGraph(adminKey, payload) {
212
  });
213
  }
214
 
 
 
 
 
 
 
 
 
 
 
 
215
  // ── Corpus ────────────────────────────────────────────────────────────────────
216
  async function apiLoadFiles() {
217
  return apiFetch('/api/v1/corpus/files');
 
212
  });
213
  }
214
 
215
// Fetch the operator URL-ingest configuration (enabled flag, allowlist, limits).
async function apiAdminGetUrlIngestOptions(adminKey) {
  const path = '/api/v1/admin/graph/url-ingest/options';
  return apiAdminFetch(path, adminKey);
}
218
+
219
// Queue a background ingestion for the given payload ({url, label}).
async function apiAdminIngestUrl(adminKey, payload) {
  const body = JSON.stringify(payload);
  return apiAdminFetch('/api/v1/admin/graph/url-ingest', adminKey, {
    method: 'POST',
    body,
  });
}
225
+
226
  // ── Corpus ────────────────────────────────────────────────────────────────────
227
  async function apiLoadFiles() {
228
  return apiFetch('/api/v1/corpus/files');
frontend/js/graph.js CHANGED
@@ -4,7 +4,7 @@
4
  * - `corpus` mode keeps the original category/document force graph
5
  * - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
6
  * - the right-hand Graph view is a minimal explorer for summary, search, path,
7
- * export, and admin-only code indexing
8
  */
9
 
10
  const GRAPH_KIND_COLORS = {
@@ -477,21 +477,31 @@ async function refreshGraphWorkspace() {
477
  async function refreshGraphOperatorState() {
478
  const card = document.getElementById('graphOperatorCard');
479
  const help = document.getElementById('graphOperatorHelp');
 
480
  if (!card || !help) return;
481
  if (!STATE.adminUnlocked || !STATE.adminKey) {
482
  card.style.display = 'none';
483
  return;
484
  }
485
- const options = await apiAdminGetCodeGraphOptions(STATE.adminKey);
 
 
 
486
  card.style.display = '';
487
  STATE.graphAllowedRoots = options.allowed_roots || [];
488
  STATE.graphIndexDefaultRoot = options.default_root || '';
 
489
  if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
490
  document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
491
  }
492
  help.innerHTML = STATE.graphAllowedRoots.length
493
  ? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
494
  : 'No operator code roots are configured.';
 
 
 
 
 
495
  }
496
 
497
  async function runGraphSearch() {
@@ -560,6 +570,24 @@ async function runOperatorCodeIndex() {
560
  await refreshGraphWorkspace();
561
  }
562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  function downloadGraphExport() {
564
  const snapshot = STATE.graphSnapshot;
565
  if (!snapshot) {
@@ -666,6 +694,7 @@ window.refreshGraphOperatorState = refreshGraphOperatorState;
666
  window.runGraphSearch = runGraphSearch;
667
  window.runGraphPath = runGraphPath;
668
  window.runOperatorCodeIndex = runOperatorCodeIndex;
 
669
  window.downloadGraphExport = downloadGraphExport;
670
  window.selectGraphNode = selectGraphNode;
671
  window.selectGraphNodeByKey = selectGraphNodeByKey;
 
4
  * - `corpus` mode keeps the original category/document force graph
5
  * - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
6
  * - the right-hand Graph view is a minimal explorer for summary, search, path,
7
+ * export, and admin-only code indexing / URL ingestion
8
  */
9
 
10
  const GRAPH_KIND_COLORS = {
 
477
/**
 * Refresh the operator card: load code-graph roots and URL-ingest options
 * in parallel, cache them on STATE, then render the allowlists into the
 * two help panels. Hides the card entirely unless the operator key is
 * unlocked.
 */
async function refreshGraphOperatorState() {
  const card = document.getElementById('graphOperatorCard');
  const help = document.getElementById('graphOperatorHelp');
  const urlHelp = document.getElementById('graphUrlOperatorHelp');
  if (!card || !help) return;
  if (!STATE.adminUnlocked || !STATE.adminKey) {
    card.style.display = 'none';
    return;
  }
  // Both endpoints require the admin key; fetch them concurrently.
  const [options, urlOptions] = await Promise.all([
    apiAdminGetCodeGraphOptions(STATE.adminKey),
    apiAdminGetUrlIngestOptions(STATE.adminKey),
  ]);
  card.style.display = '';
  STATE.graphAllowedRoots = options.allowed_roots || [];
  STATE.graphIndexDefaultRoot = options.default_root || '';
  STATE.graphAllowedHosts = urlOptions.allowed_hosts || [];
  // Pre-fill the index root only when the field is still empty.
  if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
    document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
  }
  help.innerHTML = STATE.graphAllowedRoots.length
    ? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
    : 'No operator code roots are configured.';
  if (urlHelp) {
    urlHelp.innerHTML = STATE.graphAllowedHosts.length
      ? `Allowed hosts:<br>${STATE.graphAllowedHosts.map(host => `<code>${esc(host)}</code>`).join('<br>')}`
      : 'No host allowlist is configured. URL ingestion still blocks private and non-routable hosts.';
  }
}
506
 
507
  async function runGraphSearch() {
 
570
  await refreshGraphWorkspace();
571
  }
572
 
573
// Operator action: validate the form inputs, POST the URL for ingestion,
// and toast the queued result.
async function runOperatorUrlIngest() {
  if (!STATE.adminUnlocked || !STATE.adminKey) {
    toast('Unlock operator tools first.', 'error');
    return;
  }
  const urlField = document.getElementById('graphUrlInput');
  const labelField = document.getElementById('graphUrlLabel');
  const url = urlField?.value?.trim() || '';
  const label = labelField?.value?.trim() || '';
  if (!url) {
    toast('Enter a URL to ingest first.', 'error');
    return;
  }
  const payload = { url, label: label || null };
  const result = await apiAdminIngestUrl(STATE.adminKey, payload);
  toast(`Queued URL ingestion for ${result.final_url || result.source_url}.`, 'success');
}
590
+
591
  function downloadGraphExport() {
592
  const snapshot = STATE.graphSnapshot;
593
  if (!snapshot) {
 
694
  window.runGraphSearch = runGraphSearch;
695
  window.runGraphPath = runGraphPath;
696
  window.runOperatorCodeIndex = runOperatorCodeIndex;
697
+ window.runOperatorUrlIngest = runOperatorUrlIngest;
698
  window.downloadGraphExport = downloadGraphExport;
699
  window.selectGraphNode = selectGraphNode;
700
  window.selectGraphNodeByKey = selectGraphNodeByKey;
frontend/js/state.js CHANGED
@@ -36,6 +36,7 @@ const STATE = {
36
  graphSourceKind: '',
37
  graphAllowedRoots: [],
38
  graphIndexDefaultRoot: '',
 
39
  };
40
 
41
  function stateRefreshCategories() {
 
36
  graphSourceKind: '',
37
  graphAllowedRoots: [],
38
  graphIndexDefaultRoot: '',
39
+ graphAllowedHosts: [],
40
  };
41
 
42
  function stateRefreshCategories() {
tests/test_url_ingestion.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import shutil
3
+ from types import SimpleNamespace
4
+
5
+ import pytest
6
+
7
+ from backend.api import admin as admin_api
8
+ from backend.core import pipeline
9
+ from backend.core import url_ingestion
10
+
11
+
12
+ class _FakeTask:
13
+ id = "task-123"
14
+
15
+
16
+ class _FakeResponse:
17
+ def __init__(self, *, url: str, content_type: str, body: bytes, headers=None, status_code: int = 200):
18
+ self.url = url
19
+ self._body = body
20
+ self.headers = {"content-type": content_type, **(headers or {})}
21
+ self.status_code = status_code
22
+
23
+ def raise_for_status(self):
24
+ if self.status_code >= 400:
25
+ raise url_ingestion.requests.HTTPError(f"HTTP {self.status_code}")
26
+
27
+ def iter_content(self, chunk_size=65536):
28
+ for start in range(0, len(self._body), chunk_size):
29
+ yield self._body[start:start + chunk_size]
30
+
31
+
32
+ class _FakeSession:
33
+ def __init__(self, response):
34
+ self._response = response
35
+
36
+ def __enter__(self):
37
+ return self
38
+
39
+ def __exit__(self, exc_type, exc, tb):
40
+ return False
41
+
42
+ def get(self, *args, **kwargs):
43
+ del args, kwargs
44
+ return self._response
45
+
46
+
47
+ def test_fetch_url_to_tempfile_blocks_private_ip(monkeypatch):
48
+ monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", [])
49
+ monkeypatch.setattr(
50
+ url_ingestion.socket,
51
+ "getaddrinfo",
52
+ lambda *args, **kwargs: [(None, None, None, None, ("127.0.0.1", 443))],
53
+ )
54
+
55
+ with pytest.raises(url_ingestion.UrlIngestionError):
56
+ url_ingestion.fetch_url_to_tempfile("https://example.com/internal")
57
+
58
+
59
+ def test_fetch_url_to_tempfile_extracts_readable_html(monkeypatch):
60
+ body = b"<html><head><title>Morpheus Docs</title></head><body><main><h1>Overview</h1><p>Hybrid retrieval works.</p></main></body></html>"
61
+ monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", ["docs.example.com"])
62
+ monkeypatch.setattr(
63
+ url_ingestion.socket,
64
+ "getaddrinfo",
65
+ lambda *args, **kwargs: [(None, None, None, None, ("93.184.216.34", 443))],
66
+ )
67
+ monkeypatch.setattr(
68
+ url_ingestion.requests,
69
+ "Session",
70
+ lambda: _FakeSession(
71
+ _FakeResponse(
72
+ url="https://docs.example.com/guide",
73
+ content_type="text/html; charset=utf-8",
74
+ body=body,
75
+ )
76
+ ),
77
+ )
78
+
79
+ result = url_ingestion.fetch_url_to_tempfile("https://docs.example.com/guide")
80
+
81
+ assert result.final_url == "https://docs.example.com/guide"
82
+ assert result.content_type == "text/html"
83
+ with open(result.temp_path, "r", encoding="utf-8") as handle:
84
+ rendered = handle.read()
85
+ assert "# Morpheus Docs" in rendered
86
+ assert "Hybrid retrieval works." in rendered
87
+
88
+
89
+ def test_run_ingestion_with_url_override_persists_url_metadata(monkeypatch):
90
+ from tests.test_pipeline_regressions import FakeIngestionSupabase
91
+
92
+ fake_supabase = FakeIngestionSupabase()
93
+ captured = {}
94
+ root = Path("tests") / "_tmp_graph_hybrid" / "url_ingestion"
95
+ root.mkdir(parents=True, exist_ok=True)
96
+ try:
97
+ source_path = root / "remote.md"
98
+ source_path.write_text("# Remote Doc\n\nThis URL import contains enough content to ingest.\n", encoding="utf-8")
99
+
100
+ monkeypatch.setattr("backend.core.auth_utils.extract_jwt_sub", lambda token: "user-1")
101
+ monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda path: "url-hash")
102
+ monkeypatch.setattr(pipeline, "is_file_already_ingested", lambda file_hash, access_token=None: False)
103
+ monkeypatch.setattr(pipeline, "_recover_or_prepare_orphaned_upload", lambda *args, **kwargs: None)
104
+ monkeypatch.setattr(pipeline, "_build_supabase_client", lambda access_token=None: fake_supabase)
105
+ monkeypatch.setattr(pipeline, "_build_service_supabase_client", lambda: fake_supabase)
106
+ monkeypatch.setattr(
107
+ pipeline,
108
+ "extract_document_entities",
109
+ lambda *args, **kwargs: SimpleNamespace(is_allowed=True, document_type="general_document"),
110
+ )
111
+ monkeypatch.setattr(pipeline, "_build_document_tree", lambda elements: {"title": "root", "children": []})
112
+ monkeypatch.setattr(
113
+ pipeline,
114
+ "process_chunks",
115
+ lambda chunks, elements, path_for_naming, file_hash, graph_data, user_id, pdf_images, **kwargs: (
116
+ captured.setdefault(
117
+ "docs",
118
+ [SimpleNamespace(metadata={**kwargs, "source": "Remote Doc", "title": "Remote Doc"})],
119
+ ),
120
+ ["doc-1"],
121
+ ),
122
+ )
123
+ monkeypatch.setattr(pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids))
124
+ monkeypatch.setattr(pipeline, "_persist_graph_foundation", lambda **kwargs: None)
125
+ monkeypatch.setattr(pipeline, "upload_to_supabase", lambda docs, ids, access_token=None: None)
126
+ monkeypatch.setattr(pipeline, "_identity_json_from_elements", lambda elements, fallback_title=None: {})
127
+ monkeypatch.setattr(pipeline, "_extract_pdf_title", lambda elements, filename: "Remote Doc")
128
+ monkeypatch.setattr(pipeline, "_log_ingestion_retry_event", lambda **kwargs: None)
129
+
130
+ result = pipeline.run_ingestion(
131
+ file_path=str(source_path),
132
+ original_filename="remote-doc.md",
133
+ access_token="token",
134
+ source_kind_override="url",
135
+ data_shape_override="hybrid",
136
+ parser_kind_override="url_fetch",
137
+ )
138
+
139
+ assert result["filename"] == "Remote Doc"
140
+ assert result["file_hash"] == "url-hash"
141
+ assert captured["docs"][0].metadata["source_kind"] == "url"
142
+ assert captured["docs"][0].metadata["data_shape"] == "hybrid"
143
+ assert captured["docs"][0].metadata["parser_kind"] == "url_fetch"
144
+ ingested_upsert = next(item for item in fake_supabase.upserts if item[0] == "ingested_files")
145
+ assert ingested_upsert[1]["source_kind"] == "url"
146
+ assert ingested_upsert[1]["parser_kind"] == "url_fetch"
147
+ finally:
148
+ shutil.rmtree(root, ignore_errors=True)
149
+
150
+
151
+ def test_run_url_ingest_requires_admin_and_queues_task(monkeypatch):
152
+ monkeypatch.setattr(admin_api, "_check_admin", lambda key: None)
153
+ monkeypatch.setattr(admin_api.config, "URL_INGEST_ADMIN_ENABLED", True, raising=False)
154
+ monkeypatch.setattr(
155
+ admin_api,
156
+ "fetch_url_to_tempfile",
157
+ lambda url, label=None: SimpleNamespace(
158
+ source_url=url,
159
+ final_url=url,
160
+ filename="Morpheus-Docs.md",
161
+ temp_path="C:/tmp/morpheus-url.md",
162
+ content_type="text/html",
163
+ content_bytes=1234,
164
+ title=label or "Morpheus Docs",
165
+ ),
166
+ )
167
+ monkeypatch.setattr(admin_api.process_document_task, "delay", lambda *args: _FakeTask())
168
+
169
+ result = admin_api.run_url_ingest(
170
+ admin_api.UrlIngestPayload(url="https://docs.example.com/guide", label="Morpheus Docs"),
171
+ x_admin_key="admin",
172
+ x_auth_token="token",
173
+ user_id="user-1",
174
+ )
175
+
176
+ assert result["ok"] is True
177
+ assert result["task_id"] == "task-123"
178
+ assert result["filename"] == "Morpheus-Docs.md"