Spaces:
Running
Running
nothex commited on
Commit ·
0f5e813
1
Parent(s): 96732d9
feat: add safe operator URL ingestion
Browse files- .env.example +4 -0
- .gitignore +1 -0
- backend/api/admin.py +74 -0
- backend/core/config.py +22 -0
- backend/core/pipeline_ingestion.py +19 -4
- backend/core/tasks.py +12 -1
- backend/core/url_ingestion.py +239 -0
- docs/operations_playbook.md +1 -0
- docs/release_checklist.md +2 -0
- frontend/index.html +7 -0
- frontend/js/api.js +11 -0
- frontend/js/graph.js +31 -2
- frontend/js/state.js +1 -0
- tests/test_url_ingestion.py +178 -0
.env.example
CHANGED
|
@@ -13,6 +13,10 @@ NVIDIA_API_BASE_URL=https://integrate.api.nvidia.com/v1
|
|
| 13 |
# Admin review backend
|
| 14 |
ADMIN_REVIEW_PROVIDER=auto
|
| 15 |
ADMIN_REVIEW_MODEL=gemma4:latest
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Supabase
|
| 18 |
SUPABASE_URL=https://example.supabase.co
|
|
|
|
| 13 |
# Admin review backend
|
| 14 |
ADMIN_REVIEW_PROVIDER=auto
|
| 15 |
ADMIN_REVIEW_MODEL=gemma4:latest
|
| 16 |
+
URL_INGEST_ADMIN_ENABLED=true
|
| 17 |
+
URL_INGEST_ALLOWED_HOSTS=
|
| 18 |
+
URL_INGEST_TIMEOUT_S=12
|
| 19 |
+
URL_INGEST_MAX_BYTES=1500000
|
| 20 |
|
| 21 |
# Supabase
|
| 22 |
SUPABASE_URL=https://example.supabase.co
|
.gitignore
CHANGED
|
@@ -17,6 +17,7 @@ note_to_me.txt
|
|
| 17 |
|
| 18 |
.dual-graph/
|
| 19 |
tests/_tmp_graph_hybrid/
|
|
|
|
| 20 |
tests/_tmp_pytest*/
|
| 21 |
tests/_tmp_intent_monitor/
|
| 22 |
tests/_tmp_intent_rollback/
|
|
|
|
| 17 |
|
| 18 |
.dual-graph/
|
| 19 |
tests/_tmp_graph_hybrid/
|
| 20 |
+
tests/_tmp_url_ingestion/
|
| 21 |
tests/_tmp_pytest*/
|
| 22 |
tests/_tmp_intent_monitor/
|
| 23 |
tests/_tmp_intent_rollback/
|
backend/api/admin.py
CHANGED
|
@@ -12,6 +12,8 @@ from pydantic import BaseModel
|
|
| 12 |
from backend.core.auth_utils import require_auth_token
|
| 13 |
from backend.core import config
|
| 14 |
from backend.core.code_graph import index_python_codebase
|
|
|
|
|
|
|
| 15 |
from backend.core.warmup_classifier import warmup, warmup_cross_encoder
|
| 16 |
from backend.core.pipeline import _build_service_supabase_client
|
| 17 |
|
|
@@ -67,6 +69,11 @@ class CodeGraphIndexPayload(BaseModel):
|
|
| 67 |
label: Optional[str] = None
|
| 68 |
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def _admin_client():
|
| 71 |
return _build_service_supabase_client()
|
| 72 |
|
|
@@ -110,6 +117,11 @@ def _resolve_code_graph_root(root_path: str) -> Path:
|
|
| 110 |
)
|
| 111 |
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def _trace_sort_key(row: dict):
|
| 114 |
return row.get("created_at") or ""
|
| 115 |
|
|
@@ -923,3 +935,65 @@ def run_code_graph_index(
|
|
| 923 |
"root_path": str(root),
|
| 924 |
"result": result,
|
| 925 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from backend.core.auth_utils import require_auth_token
|
| 13 |
from backend.core import config
|
| 14 |
from backend.core.code_graph import index_python_codebase
|
| 15 |
+
from backend.core.tasks import process_document_task
|
| 16 |
+
from backend.core.url_ingestion import UrlIngestionError, fetch_url_to_tempfile
|
| 17 |
from backend.core.warmup_classifier import warmup, warmup_cross_encoder
|
| 18 |
from backend.core.pipeline import _build_service_supabase_client
|
| 19 |
|
|
|
|
| 69 |
label: Optional[str] = None
|
| 70 |
|
| 71 |
|
| 72 |
+
class UrlIngestPayload(BaseModel):
|
| 73 |
+
url: str
|
| 74 |
+
label: Optional[str] = None
|
| 75 |
+
|
| 76 |
+
|
| 77 |
def _admin_client():
|
| 78 |
return _build_service_supabase_client()
|
| 79 |
|
|
|
|
| 117 |
)
|
| 118 |
|
| 119 |
|
| 120 |
+
def _ensure_url_ingest_enabled() -> None:
|
| 121 |
+
if not config.URL_INGEST_ADMIN_ENABLED:
|
| 122 |
+
raise HTTPException(status_code=403, detail="Operator URL ingestion is disabled.")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
def _trace_sort_key(row: dict):
|
| 126 |
return row.get("created_at") or ""
|
| 127 |
|
|
|
|
| 935 |
"root_path": str(root),
|
| 936 |
"result": result,
|
| 937 |
}
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
@router.get("/graph/url-ingest/options")
|
| 941 |
+
def get_url_ingest_options(
|
| 942 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 943 |
+
user_id: str = Depends(require_auth_token),
|
| 944 |
+
):
|
| 945 |
+
_check_admin(x_admin_key)
|
| 946 |
+
return {
|
| 947 |
+
"ok": True,
|
| 948 |
+
"enabled": bool(config.URL_INGEST_ADMIN_ENABLED),
|
| 949 |
+
"allowed_hosts": list(config.URL_INGEST_ALLOWED_HOSTS),
|
| 950 |
+
"timeout_s": config.URL_INGEST_TIMEOUT_S,
|
| 951 |
+
"max_bytes": config.URL_INGEST_MAX_BYTES,
|
| 952 |
+
"user_id": user_id,
|
| 953 |
+
}
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
@router.post("/graph/url-ingest")
|
| 957 |
+
def run_url_ingest(
|
| 958 |
+
payload: UrlIngestPayload,
|
| 959 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 960 |
+
x_auth_token: str = Header(..., alias="X-Auth-Token"),
|
| 961 |
+
user_id: str = Depends(require_auth_token),
|
| 962 |
+
):
|
| 963 |
+
del user_id
|
| 964 |
+
_check_admin(x_admin_key)
|
| 965 |
+
_ensure_url_ingest_enabled()
|
| 966 |
+
try:
|
| 967 |
+
fetched = fetch_url_to_tempfile(payload.url, label=payload.label)
|
| 968 |
+
except UrlIngestionError as exc:
|
| 969 |
+
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 970 |
+
except Exception as exc:
|
| 971 |
+
log.error("Operator URL ingestion failed for %s: %s", payload.url, exc)
|
| 972 |
+
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
| 973 |
+
|
| 974 |
+
try:
|
| 975 |
+
task = process_document_task.delay(
|
| 976 |
+
fetched.temp_path,
|
| 977 |
+
fetched.filename,
|
| 978 |
+
x_auth_token,
|
| 979 |
+
"url",
|
| 980 |
+
"hybrid",
|
| 981 |
+
"url_fetch",
|
| 982 |
+
)
|
| 983 |
+
except Exception:
|
| 984 |
+
try:
|
| 985 |
+
os.unlink(fetched.temp_path)
|
| 986 |
+
except OSError:
|
| 987 |
+
log.warning("Could not remove fetched URL temp file %s", fetched.temp_path)
|
| 988 |
+
raise
|
| 989 |
+
|
| 990 |
+
return {
|
| 991 |
+
"ok": True,
|
| 992 |
+
"task_id": task.id,
|
| 993 |
+
"source_url": fetched.source_url,
|
| 994 |
+
"final_url": fetched.final_url,
|
| 995 |
+
"filename": fetched.filename,
|
| 996 |
+
"content_type": fetched.content_type,
|
| 997 |
+
"content_bytes": fetched.content_bytes,
|
| 998 |
+
"title": fetched.title,
|
| 999 |
+
}
|
backend/core/config.py
CHANGED
|
@@ -279,6 +279,12 @@ CODE_GRAPH_ADMIN_ENABLED = os.getenv("CODE_GRAPH_ADMIN_ENABLED", "true").strip()
|
|
| 279 |
"yes",
|
| 280 |
"on",
|
| 281 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
CODE_GRAPH_ALLOWED_ROOTS = [
|
| 283 |
candidate
|
| 284 |
for candidate in [
|
|
@@ -291,6 +297,22 @@ CODE_GRAPH_ALLOWED_ROOTS = [
|
|
| 291 |
]
|
| 292 |
if candidate
|
| 293 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
|
| 296 |
def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
|
|
|
|
| 279 |
"yes",
|
| 280 |
"on",
|
| 281 |
}
|
| 282 |
+
URL_INGEST_ADMIN_ENABLED = os.getenv("URL_INGEST_ADMIN_ENABLED", "true").strip().lower() in {
|
| 283 |
+
"1",
|
| 284 |
+
"true",
|
| 285 |
+
"yes",
|
| 286 |
+
"on",
|
| 287 |
+
}
|
| 288 |
CODE_GRAPH_ALLOWED_ROOTS = [
|
| 289 |
candidate
|
| 290 |
for candidate in [
|
|
|
|
| 297 |
]
|
| 298 |
if candidate
|
| 299 |
]
|
| 300 |
+
URL_INGEST_ALLOWED_HOSTS = [
|
| 301 |
+
segment.strip().lower()
|
| 302 |
+
for segment in os.getenv("URL_INGEST_ALLOWED_HOSTS", "").split(",")
|
| 303 |
+
if segment.strip()
|
| 304 |
+
]
|
| 305 |
+
URL_INGEST_ALLOWED_CONTENT_TYPES = {
|
| 306 |
+
"text/html",
|
| 307 |
+
"application/xhtml+xml",
|
| 308 |
+
"text/plain",
|
| 309 |
+
"text/markdown",
|
| 310 |
+
}
|
| 311 |
+
URL_INGEST_TIMEOUT_S = float(os.getenv("URL_INGEST_TIMEOUT_S", "12"))
|
| 312 |
+
URL_INGEST_MAX_BYTES = int(os.getenv("URL_INGEST_MAX_BYTES", "1500000"))
|
| 313 |
+
URL_INGEST_USER_AGENT = os.getenv(
|
| 314 |
+
"URL_INGEST_USER_AGENT", "MorpheusBot/1.0 (+https://nothex-morpheus-rag.hf.space)"
|
| 315 |
+
).strip()
|
| 316 |
|
| 317 |
|
| 318 |
def get_retrieval_profile(document_type: str | None) -> dict[str, float]:
|
backend/core/pipeline_ingestion.py
CHANGED
|
@@ -406,6 +406,9 @@ def run_ingestion(
|
|
| 406 |
original_filename: str = None,
|
| 407 |
access_token: str = None,
|
| 408 |
pdf_path: Optional[str] = None,
|
|
|
|
|
|
|
|
|
|
| 409 |
) -> str:
|
| 410 |
"""
|
| 411 |
Ingestion orchestrator.
|
|
@@ -447,6 +450,12 @@ def run_ingestion(
|
|
| 447 |
resolved_path,
|
| 448 |
original_filename=original_filename,
|
| 449 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
log.info("=" * 50)
|
| 451 |
log.info("Starting ingestion: %s", resolved_path)
|
| 452 |
|
|
@@ -502,12 +511,12 @@ def run_ingestion(
|
|
| 502 |
access_token=access_token,
|
| 503 |
)
|
| 504 |
|
| 505 |
-
if source_kind
|
| 506 |
-
_progress(2, "Parsing
|
| 507 |
else:
|
| 508 |
_progress(2, "Partitioning PDF (OCR + layout detection)…")
|
| 509 |
stage_started = time.perf_counter()
|
| 510 |
-
if source_kind
|
| 511 |
elements = partition_markdown(resolved_path)
|
| 512 |
pdf_images = {}
|
| 513 |
else:
|
|
@@ -517,6 +526,8 @@ def run_ingestion(
|
|
| 517 |
if not elements:
|
| 518 |
if source_kind == "markdown":
|
| 519 |
raise ValueError("Markdown file appears empty or unreadable.")
|
|
|
|
|
|
|
| 520 |
raise ValueError(
|
| 521 |
"The PDF appears blank or unreadable. "
|
| 522 |
"If scanned, ensure tesseract-ocr is installed."
|
|
@@ -528,6 +539,10 @@ def run_ingestion(
|
|
| 528 |
raise ValueError(
|
| 529 |
f"Markdown file contains almost no readable text ({text_chars} chars)."
|
| 530 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
raise ValueError(
|
| 532 |
f"PDF contains almost no readable text ({text_chars} chars). "
|
| 533 |
"May be corrupted or image-only without OCR layer."
|
|
@@ -567,7 +582,7 @@ def run_ingestion(
|
|
| 567 |
|
| 568 |
_progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
|
| 569 |
stage_started = time.perf_counter()
|
| 570 |
-
if source_kind
|
| 571 |
chunks = _create_markdown_chunks(elements)
|
| 572 |
else:
|
| 573 |
chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
|
|
|
|
| 406 |
original_filename: str = None,
|
| 407 |
access_token: str = None,
|
| 408 |
pdf_path: Optional[str] = None,
|
| 409 |
+
source_kind_override: Optional[str] = None,
|
| 410 |
+
data_shape_override: Optional[str] = None,
|
| 411 |
+
parser_kind_override: Optional[str] = None,
|
| 412 |
) -> str:
|
| 413 |
"""
|
| 414 |
Ingestion orchestrator.
|
|
|
|
| 450 |
resolved_path,
|
| 451 |
original_filename=original_filename,
|
| 452 |
)
|
| 453 |
+
if source_kind_override:
|
| 454 |
+
source_kind = str(source_kind_override).strip().lower()
|
| 455 |
+
if data_shape_override:
|
| 456 |
+
data_shape = str(data_shape_override).strip().lower()
|
| 457 |
+
if parser_kind_override:
|
| 458 |
+
parser_kind = str(parser_kind_override).strip().lower()
|
| 459 |
log.info("=" * 50)
|
| 460 |
log.info("Starting ingestion: %s", resolved_path)
|
| 461 |
|
|
|
|
| 511 |
access_token=access_token,
|
| 512 |
)
|
| 513 |
|
| 514 |
+
if source_kind in {"markdown", "url"}:
|
| 515 |
+
_progress(2, "Parsing document structure…")
|
| 516 |
else:
|
| 517 |
_progress(2, "Partitioning PDF (OCR + layout detection)…")
|
| 518 |
stage_started = time.perf_counter()
|
| 519 |
+
if source_kind in {"markdown", "url"}:
|
| 520 |
elements = partition_markdown(resolved_path)
|
| 521 |
pdf_images = {}
|
| 522 |
else:
|
|
|
|
| 526 |
if not elements:
|
| 527 |
if source_kind == "markdown":
|
| 528 |
raise ValueError("Markdown file appears empty or unreadable.")
|
| 529 |
+
if source_kind == "url":
|
| 530 |
+
raise ValueError("Fetched URL content appears empty or unreadable.")
|
| 531 |
raise ValueError(
|
| 532 |
"The PDF appears blank or unreadable. "
|
| 533 |
"If scanned, ensure tesseract-ocr is installed."
|
|
|
|
| 539 |
raise ValueError(
|
| 540 |
f"Markdown file contains almost no readable text ({text_chars} chars)."
|
| 541 |
)
|
| 542 |
+
if source_kind == "url":
|
| 543 |
+
raise ValueError(
|
| 544 |
+
f"Fetched URL content contains almost no readable text ({text_chars} chars)."
|
| 545 |
+
)
|
| 546 |
raise ValueError(
|
| 547 |
f"PDF contains almost no readable text ({text_chars} chars). "
|
| 548 |
"May be corrupted or image-only without OCR layer."
|
|
|
|
| 582 |
|
| 583 |
_progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
|
| 584 |
stage_started = time.perf_counter()
|
| 585 |
+
if source_kind in {"markdown", "url"}:
|
| 586 |
chunks = _create_markdown_chunks(elements)
|
| 587 |
else:
|
| 588 |
chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
|
backend/core/tasks.py
CHANGED
|
@@ -54,7 +54,15 @@ def _cleanup_temp_upload(tmp_path: str) -> None:
|
|
| 54 |
log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
|
| 55 |
|
| 56 |
|
| 57 |
-
def _process_document_task_impl(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
"""
|
| 59 |
This runs in a completely separate background process!
|
| 60 |
We pass a progress_callback to run_ingestion so it can report its status.
|
|
@@ -72,6 +80,9 @@ def _process_document_task_impl(self, tmp_path: str, original_filename: str, acc
|
|
| 72 |
original_filename=original_filename,
|
| 73 |
progress_callback=update_progress,
|
| 74 |
access_token=access_token,
|
|
|
|
|
|
|
|
|
|
| 75 |
)
|
| 76 |
finally:
|
| 77 |
_cleanup_temp_upload(tmp_path)
|
|
|
|
| 54 |
log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
|
| 55 |
|
| 56 |
|
| 57 |
+
def _process_document_task_impl(
|
| 58 |
+
self,
|
| 59 |
+
tmp_path: str,
|
| 60 |
+
original_filename: str,
|
| 61 |
+
access_token: str,
|
| 62 |
+
source_kind_override: str | None = None,
|
| 63 |
+
data_shape_override: str | None = None,
|
| 64 |
+
parser_kind_override: str | None = None,
|
| 65 |
+
):
|
| 66 |
"""
|
| 67 |
This runs in a completely separate background process!
|
| 68 |
We pass a progress_callback to run_ingestion so it can report its status.
|
|
|
|
| 80 |
original_filename=original_filename,
|
| 81 |
progress_callback=update_progress,
|
| 82 |
access_token=access_token,
|
| 83 |
+
source_kind_override=source_kind_override,
|
| 84 |
+
data_shape_override=data_shape_override,
|
| 85 |
+
parser_kind_override=parser_kind_override,
|
| 86 |
)
|
| 87 |
finally:
|
| 88 |
_cleanup_temp_upload(tmp_path)
|
backend/core/url_ingestion.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Operator-safe URL ingestion helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from html.parser import HTMLParser
|
| 7 |
+
import ipaddress
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import socket
|
| 11 |
+
import tempfile
|
| 12 |
+
from typing import Iterable
|
| 13 |
+
from urllib.parse import urlparse
|
| 14 |
+
|
| 15 |
+
import requests
|
| 16 |
+
|
| 17 |
+
from backend.core import config
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class UrlIngestionError(ValueError):
|
| 21 |
+
"""Raised when a URL cannot be fetched safely for ingestion."""
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
|
| 25 |
+
class FetchedUrlDocument:
|
| 26 |
+
source_url: str
|
| 27 |
+
final_url: str
|
| 28 |
+
filename: str
|
| 29 |
+
temp_path: str
|
| 30 |
+
content_type: str
|
| 31 |
+
content_bytes: int
|
| 32 |
+
title: str | None = None
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class _VisibleTextParser(HTMLParser):
|
| 36 |
+
def __init__(self) -> None:
|
| 37 |
+
super().__init__()
|
| 38 |
+
self._skip_depth = 0
|
| 39 |
+
self._parts: list[str] = []
|
| 40 |
+
self._title_parts: list[str] = []
|
| 41 |
+
self._inside_title = False
|
| 42 |
+
|
| 43 |
+
def handle_starttag(self, tag: str, attrs) -> None: # noqa: ANN001
|
| 44 |
+
del attrs
|
| 45 |
+
if tag in {"script", "style", "noscript"}:
|
| 46 |
+
self._skip_depth += 1
|
| 47 |
+
if tag == "title":
|
| 48 |
+
self._inside_title = True
|
| 49 |
+
|
| 50 |
+
def handle_endtag(self, tag: str) -> None:
|
| 51 |
+
if tag in {"script", "style", "noscript"} and self._skip_depth > 0:
|
| 52 |
+
self._skip_depth -= 1
|
| 53 |
+
if tag == "title":
|
| 54 |
+
self._inside_title = False
|
| 55 |
+
if tag in {"p", "div", "section", "article", "li", "br", "h1", "h2", "h3", "h4"}:
|
| 56 |
+
self._parts.append("\n")
|
| 57 |
+
|
| 58 |
+
def handle_data(self, data: str) -> None:
|
| 59 |
+
if self._skip_depth > 0:
|
| 60 |
+
return
|
| 61 |
+
text = " ".join(data.split())
|
| 62 |
+
if not text:
|
| 63 |
+
return
|
| 64 |
+
if self._inside_title:
|
| 65 |
+
self._title_parts.append(text)
|
| 66 |
+
self._parts.append(text)
|
| 67 |
+
|
| 68 |
+
@property
|
| 69 |
+
def title(self) -> str | None:
|
| 70 |
+
title = " ".join(self._title_parts).strip()
|
| 71 |
+
return title or None
|
| 72 |
+
|
| 73 |
+
@property
|
| 74 |
+
def text(self) -> str:
|
| 75 |
+
chunks: list[str] = []
|
| 76 |
+
for part in self._parts:
|
| 77 |
+
if part == "\n":
|
| 78 |
+
if not chunks or chunks[-1] == "\n":
|
| 79 |
+
continue
|
| 80 |
+
chunks.append("\n")
|
| 81 |
+
continue
|
| 82 |
+
if chunks and chunks[-1] not in {"\n", ""}:
|
| 83 |
+
chunks.append(" ")
|
| 84 |
+
chunks.append(part)
|
| 85 |
+
text = "".join(chunks)
|
| 86 |
+
lines = [line.strip() for line in text.splitlines()]
|
| 87 |
+
return "\n".join(line for line in lines if line)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _normalize_host(hostname: str | None) -> str:
|
| 91 |
+
if not hostname:
|
| 92 |
+
raise UrlIngestionError("URL must include a hostname.")
|
| 93 |
+
return hostname.strip().lower().rstrip(".")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _resolve_allowed_hosts() -> list[str]:
|
| 97 |
+
seen: set[str] = set()
|
| 98 |
+
allowed: list[str] = []
|
| 99 |
+
for raw in config.URL_INGEST_ALLOWED_HOSTS:
|
| 100 |
+
host = _normalize_host(raw)
|
| 101 |
+
if host in seen:
|
| 102 |
+
continue
|
| 103 |
+
seen.add(host)
|
| 104 |
+
allowed.append(host)
|
| 105 |
+
return allowed
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _host_is_allowed(hostname: str, allowed_hosts: Iterable[str]) -> bool:
|
| 109 |
+
allowed = list(allowed_hosts)
|
| 110 |
+
if not allowed:
|
| 111 |
+
return True
|
| 112 |
+
for allowed_host in allowed:
|
| 113 |
+
if hostname == allowed_host or hostname.endswith(f".{allowed_host}"):
|
| 114 |
+
return True
|
| 115 |
+
return False
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _ensure_safe_host(parsed_url) -> None: # noqa: ANN001
|
| 119 |
+
hostname = _normalize_host(parsed_url.hostname)
|
| 120 |
+
if parsed_url.username or parsed_url.password:
|
| 121 |
+
raise UrlIngestionError("Authenticated URLs are not allowed.")
|
| 122 |
+
if parsed_url.scheme not in {"http", "https"}:
|
| 123 |
+
raise UrlIngestionError("Only http and https URLs are supported.")
|
| 124 |
+
if parsed_url.port not in {None, 80, 443}:
|
| 125 |
+
raise UrlIngestionError("Only standard ports 80 and 443 are allowed.")
|
| 126 |
+
if not _host_is_allowed(hostname, _resolve_allowed_hosts()):
|
| 127 |
+
raise UrlIngestionError("Requested host is not in the URL ingestion allowlist.")
|
| 128 |
+
|
| 129 |
+
try:
|
| 130 |
+
addr_info = socket.getaddrinfo(hostname, parsed_url.port or 443, type=socket.SOCK_STREAM)
|
| 131 |
+
except socket.gaierror as exc:
|
| 132 |
+
raise UrlIngestionError(f"Could not resolve host: {hostname}") from exc
|
| 133 |
+
|
| 134 |
+
for _family, _socktype, _proto, _canonname, sockaddr in addr_info:
|
| 135 |
+
raw_ip = sockaddr[0]
|
| 136 |
+
ip = ipaddress.ip_address(raw_ip)
|
| 137 |
+
if (
|
| 138 |
+
ip.is_private
|
| 139 |
+
or ip.is_loopback
|
| 140 |
+
or ip.is_link_local
|
| 141 |
+
or ip.is_multicast
|
| 142 |
+
or ip.is_reserved
|
| 143 |
+
or ip.is_unspecified
|
| 144 |
+
):
|
| 145 |
+
raise UrlIngestionError("URL resolves to a blocked private or non-routable address.")
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _sanitize_filename(url: str, title: str | None) -> str:
|
| 149 |
+
parsed = urlparse(url)
|
| 150 |
+
stem = title or Path(parsed.path).stem or parsed.hostname or "url-document"
|
| 151 |
+
cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", " "} else "-" for ch in stem)
|
| 152 |
+
cleaned = "-".join(cleaned.split()).strip("-_")
|
| 153 |
+
cleaned = cleaned[:80] or "url-document"
|
| 154 |
+
return f"{cleaned}.md"
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _render_markdown_from_remote(source_url: str, title: str | None, body: str) -> str:
|
| 158 |
+
parts = []
|
| 159 |
+
if title:
|
| 160 |
+
parts.append(f"# {title}")
|
| 161 |
+
parts.append(f"Source URL: {source_url}")
|
| 162 |
+
parts.append("")
|
| 163 |
+
parts.append(body.strip())
|
| 164 |
+
return "\n".join(parts).strip() + "\n"
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _extract_text_payload(content_type: str, raw_bytes: bytes) -> tuple[str | None, str]:
|
| 168 |
+
lowered = content_type.lower()
|
| 169 |
+
decoded = raw_bytes.decode("utf-8", errors="replace")
|
| 170 |
+
if lowered.startswith("text/html") or "application/xhtml+xml" in lowered:
|
| 171 |
+
parser = _VisibleTextParser()
|
| 172 |
+
parser.feed(decoded)
|
| 173 |
+
text = parser.text
|
| 174 |
+
if not text.strip():
|
| 175 |
+
raise UrlIngestionError("Fetched HTML did not contain readable text.")
|
| 176 |
+
return parser.title, text
|
| 177 |
+
text = decoded.strip()
|
| 178 |
+
if not text:
|
| 179 |
+
raise UrlIngestionError("Fetched URL returned empty text.")
|
| 180 |
+
return None, text
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def fetch_url_to_tempfile(url: str, *, label: str | None = None) -> FetchedUrlDocument:
|
| 184 |
+
parsed = urlparse((url or "").strip())
|
| 185 |
+
_ensure_safe_host(parsed)
|
| 186 |
+
|
| 187 |
+
with requests.Session() as session:
|
| 188 |
+
response = session.get(
|
| 189 |
+
parsed.geturl(),
|
| 190 |
+
allow_redirects=True,
|
| 191 |
+
timeout=config.URL_INGEST_TIMEOUT_S,
|
| 192 |
+
stream=True,
|
| 193 |
+
headers={"User-Agent": config.URL_INGEST_USER_AGENT},
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
try:
|
| 197 |
+
response.raise_for_status()
|
| 198 |
+
except requests.HTTPError as exc:
|
| 199 |
+
raise UrlIngestionError(f"URL fetch failed with status {response.status_code}.") from exc
|
| 200 |
+
|
| 201 |
+
final_url = response.url or parsed.geturl()
|
| 202 |
+
_ensure_safe_host(urlparse(final_url))
|
| 203 |
+
|
| 204 |
+
content_type = (response.headers.get("content-type") or "").split(";", 1)[0].strip().lower()
|
| 205 |
+
if content_type not in config.URL_INGEST_ALLOWED_CONTENT_TYPES:
|
| 206 |
+
raise UrlIngestionError(f"Unsupported URL content type: {content_type or 'unknown'}.")
|
| 207 |
+
|
| 208 |
+
declared_length = response.headers.get("content-length")
|
| 209 |
+
if declared_length:
|
| 210 |
+
try:
|
| 211 |
+
if int(declared_length) > config.URL_INGEST_MAX_BYTES:
|
| 212 |
+
raise UrlIngestionError("Remote URL response exceeds the configured size limit.")
|
| 213 |
+
except ValueError:
|
| 214 |
+
pass
|
| 215 |
+
|
| 216 |
+
body = bytearray()
|
| 217 |
+
for chunk in response.iter_content(chunk_size=65536):
|
| 218 |
+
if not chunk:
|
| 219 |
+
continue
|
| 220 |
+
body.extend(chunk)
|
| 221 |
+
if len(body) > config.URL_INGEST_MAX_BYTES:
|
| 222 |
+
raise UrlIngestionError("Remote URL response exceeds the configured size limit.")
|
| 223 |
+
|
| 224 |
+
title, text = _extract_text_payload(content_type, bytes(body))
|
| 225 |
+
rendered = _render_markdown_from_remote(final_url, label or title, text)
|
| 226 |
+
tmp_fd, tmp_path = tempfile.mkstemp(suffix=".md", prefix="morpheus_url_")
|
| 227 |
+
os.close(tmp_fd)
|
| 228 |
+
with open(tmp_path, "w", encoding="utf-8") as handle:
|
| 229 |
+
handle.write(rendered)
|
| 230 |
+
|
| 231 |
+
return FetchedUrlDocument(
|
| 232 |
+
source_url=parsed.geturl(),
|
| 233 |
+
final_url=final_url,
|
| 234 |
+
filename=_sanitize_filename(final_url, label or title),
|
| 235 |
+
temp_path=tmp_path,
|
| 236 |
+
content_type=content_type,
|
| 237 |
+
content_bytes=len(body),
|
| 238 |
+
title=label or title,
|
| 239 |
+
)
|
docs/operations_playbook.md
CHANGED
|
@@ -21,6 +21,7 @@
|
|
| 21 |
## Graph Workflow
|
| 22 |
- Use Graph view for summary, search, path, and export
|
| 23 |
- Code indexing is operator-only and restricted to allowed roots
|
|
|
|
| 24 |
- Keep PDF answer-first flow unchanged while graph features expand
|
| 25 |
|
| 26 |
## Admin Review Backends
|
|
|
|
| 21 |
## Graph Workflow
|
| 22 |
- Use Graph view for summary, search, path, and export
|
| 23 |
- Code indexing is operator-only and restricted to allowed roots
|
| 24 |
+
- URL ingestion is operator-only and restricted by host allowlist plus public-IP checks
|
| 25 |
- Keep PDF answer-first flow unchanged while graph features expand
|
| 26 |
|
| 27 |
## Admin Review Backends
|
docs/release_checklist.md
CHANGED
|
@@ -22,9 +22,11 @@
|
|
| 22 |
- Admin review loads traces and evaluation datasets
|
| 23 |
- Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
|
| 24 |
- Graph summary/search/export endpoints return tenant-scoped data
|
|
|
|
| 25 |
|
| 26 |
## Operator Checks
|
| 27 |
- Confirm reviewed eval rows and active eval rows are non-zero
|
| 28 |
- Confirm `query_traces` are still recording version metadata
|
| 29 |
- Confirm cache invalidation by feedback still works
|
| 30 |
- Confirm graph runs appear after code indexing
|
|
|
|
|
|
| 22 |
- Admin review loads traces and evaluation datasets
|
| 23 |
- Admin draft review works with the configured provider (`ollama` or NVIDIA hosted)
|
| 24 |
- Graph summary/search/export endpoints return tenant-scoped data
|
| 25 |
+
- Operator URL ingest accepts only safe public hosts and queues background ingestion
|
| 26 |
|
| 27 |
## Operator Checks
|
| 28 |
- Confirm reviewed eval rows and active eval rows are non-zero
|
| 29 |
- Confirm `query_traces` are still recording version metadata
|
| 30 |
- Confirm cache invalidation by feedback still works
|
| 31 |
- Confirm graph runs appear after code indexing
|
| 32 |
+
- Confirm URL ingest respects allowlisted hosts and blocks private-network targets
|
frontend/index.html
CHANGED
|
@@ -503,6 +503,13 @@
|
|
| 503 |
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 504 |
<button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
|
| 505 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
</div>
|
| 507 |
</div>
|
| 508 |
</div>
|
|
|
|
| 503 |
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 504 |
<button class="btn-primary" onclick="runOperatorCodeIndex()">INDEX PYTHON CODEBASE</button>
|
| 505 |
</div>
|
| 506 |
+
<div class="section-label" style="margin-top:16px;">Operator URL Ingest</div>
|
| 507 |
+
<div class="confirm-zone" id="graphUrlOperatorHelp">URL ingestion is unavailable.</div>
|
| 508 |
+
<input type="text" id="graphUrlInput" placeholder="https://example.com/docs/page" style="margin-top:10px;" />
|
| 509 |
+
<input type="text" id="graphUrlLabel" placeholder="Optional display label…" style="margin-top:8px;" />
|
| 510 |
+
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 511 |
+
<button class="btn-primary" onclick="runOperatorUrlIngest()">INGEST URL</button>
|
| 512 |
+
</div>
|
| 513 |
</div>
|
| 514 |
</div>
|
| 515 |
</div>
|
frontend/js/api.js
CHANGED
|
@@ -212,6 +212,17 @@ async function apiAdminIndexCodeGraph(adminKey, payload) {
|
|
| 212 |
});
|
| 213 |
}
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
// ── Corpus ────────────────────────────────────────────────────────────────────
|
| 216 |
async function apiLoadFiles() {
|
| 217 |
return apiFetch('/api/v1/corpus/files');
|
|
|
|
| 212 |
});
|
| 213 |
}
|
| 214 |
|
| 215 |
+
async function apiAdminGetUrlIngestOptions(adminKey) {
|
| 216 |
+
return apiAdminFetch('/api/v1/admin/graph/url-ingest/options', adminKey);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
async function apiAdminIngestUrl(adminKey, payload) {
|
| 220 |
+
return apiAdminFetch('/api/v1/admin/graph/url-ingest', adminKey, {
|
| 221 |
+
method: 'POST',
|
| 222 |
+
body: JSON.stringify(payload),
|
| 223 |
+
});
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
// ── Corpus ────────────────────────────────────────────────────────────────────
|
| 227 |
async function apiLoadFiles() {
|
| 228 |
return apiFetch('/api/v1/corpus/files');
|
frontend/js/graph.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
* - `corpus` mode keeps the original category/document force graph
|
| 5 |
* - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
|
| 6 |
* - the right-hand Graph view is a minimal explorer for summary, search, path,
|
| 7 |
-
* export, and admin-only code indexing
|
| 8 |
*/
|
| 9 |
|
| 10 |
const GRAPH_KIND_COLORS = {
|
|
@@ -477,21 +477,31 @@ async function refreshGraphWorkspace() {
|
|
| 477 |
async function refreshGraphOperatorState() {
|
| 478 |
const card = document.getElementById('graphOperatorCard');
|
| 479 |
const help = document.getElementById('graphOperatorHelp');
|
|
|
|
| 480 |
if (!card || !help) return;
|
| 481 |
if (!STATE.adminUnlocked || !STATE.adminKey) {
|
| 482 |
card.style.display = 'none';
|
| 483 |
return;
|
| 484 |
}
|
| 485 |
-
const options = await
|
|
|
|
|
|
|
|
|
|
| 486 |
card.style.display = '';
|
| 487 |
STATE.graphAllowedRoots = options.allowed_roots || [];
|
| 488 |
STATE.graphIndexDefaultRoot = options.default_root || '';
|
|
|
|
| 489 |
if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
|
| 490 |
document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
|
| 491 |
}
|
| 492 |
help.innerHTML = STATE.graphAllowedRoots.length
|
| 493 |
? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
|
| 494 |
: 'No operator code roots are configured.';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
}
|
| 496 |
|
| 497 |
async function runGraphSearch() {
|
|
@@ -560,6 +570,24 @@ async function runOperatorCodeIndex() {
|
|
| 560 |
await refreshGraphWorkspace();
|
| 561 |
}
|
| 562 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
function downloadGraphExport() {
|
| 564 |
const snapshot = STATE.graphSnapshot;
|
| 565 |
if (!snapshot) {
|
|
@@ -666,6 +694,7 @@ window.refreshGraphOperatorState = refreshGraphOperatorState;
|
|
| 666 |
window.runGraphSearch = runGraphSearch;
|
| 667 |
window.runGraphPath = runGraphPath;
|
| 668 |
window.runOperatorCodeIndex = runOperatorCodeIndex;
|
|
|
|
| 669 |
window.downloadGraphExport = downloadGraphExport;
|
| 670 |
window.selectGraphNode = selectGraphNode;
|
| 671 |
window.selectGraphNodeByKey = selectGraphNodeByKey;
|
|
|
|
| 4 |
* - `corpus` mode keeps the original category/document force graph
|
| 5 |
* - `hybrid` mode renders persisted graph nodes/edges from `/api/v1/graph/*`
|
| 6 |
* - the right-hand Graph view is a minimal explorer for summary, search, path,
|
| 7 |
+
* export, and admin-only code indexing / URL ingestion
|
| 8 |
*/
|
| 9 |
|
| 10 |
const GRAPH_KIND_COLORS = {
|
|
|
|
| 477 |
async function refreshGraphOperatorState() {
|
| 478 |
const card = document.getElementById('graphOperatorCard');
|
| 479 |
const help = document.getElementById('graphOperatorHelp');
|
| 480 |
+
const urlHelp = document.getElementById('graphUrlOperatorHelp');
|
| 481 |
if (!card || !help) return;
|
| 482 |
if (!STATE.adminUnlocked || !STATE.adminKey) {
|
| 483 |
card.style.display = 'none';
|
| 484 |
return;
|
| 485 |
}
|
| 486 |
+
const [options, urlOptions] = await Promise.all([
|
| 487 |
+
apiAdminGetCodeGraphOptions(STATE.adminKey),
|
| 488 |
+
apiAdminGetUrlIngestOptions(STATE.adminKey),
|
| 489 |
+
]);
|
| 490 |
card.style.display = '';
|
| 491 |
STATE.graphAllowedRoots = options.allowed_roots || [];
|
| 492 |
STATE.graphIndexDefaultRoot = options.default_root || '';
|
| 493 |
+
STATE.graphAllowedHosts = urlOptions.allowed_hosts || [];
|
| 494 |
if (document.getElementById('graphIndexRoot') && !document.getElementById('graphIndexRoot').value) {
|
| 495 |
document.getElementById('graphIndexRoot').value = STATE.graphIndexDefaultRoot || '';
|
| 496 |
}
|
| 497 |
help.innerHTML = STATE.graphAllowedRoots.length
|
| 498 |
? `Allowed roots:<br>${STATE.graphAllowedRoots.map(root => `<code>${esc(root)}</code>`).join('<br>')}`
|
| 499 |
: 'No operator code roots are configured.';
|
| 500 |
+
if (urlHelp) {
|
| 501 |
+
urlHelp.innerHTML = STATE.graphAllowedHosts.length
|
| 502 |
+
? `Allowed hosts:<br>${STATE.graphAllowedHosts.map(host => `<code>${esc(host)}</code>`).join('<br>')}`
|
| 503 |
+
: 'No host allowlist is configured. URL ingestion still blocks private and non-routable hosts.';
|
| 504 |
+
}
|
| 505 |
}
|
| 506 |
|
| 507 |
async function runGraphSearch() {
|
|
|
|
| 570 |
await refreshGraphWorkspace();
|
| 571 |
}
|
| 572 |
|
| 573 |
+
async function runOperatorUrlIngest() {
|
| 574 |
+
if (!STATE.adminUnlocked || !STATE.adminKey) {
|
| 575 |
+
toast('Unlock operator tools first.', 'error');
|
| 576 |
+
return;
|
| 577 |
+
}
|
| 578 |
+
const url = document.getElementById('graphUrlInput')?.value?.trim() || '';
|
| 579 |
+
const label = document.getElementById('graphUrlLabel')?.value?.trim() || '';
|
| 580 |
+
if (!url) {
|
| 581 |
+
toast('Enter a URL to ingest first.', 'error');
|
| 582 |
+
return;
|
| 583 |
+
}
|
| 584 |
+
const result = await apiAdminIngestUrl(STATE.adminKey, {
|
| 585 |
+
url,
|
| 586 |
+
label: label || null,
|
| 587 |
+
});
|
| 588 |
+
toast(`Queued URL ingestion for ${result.final_url || result.source_url}.`, 'success');
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
function downloadGraphExport() {
|
| 592 |
const snapshot = STATE.graphSnapshot;
|
| 593 |
if (!snapshot) {
|
|
|
|
| 694 |
window.runGraphSearch = runGraphSearch;
|
| 695 |
window.runGraphPath = runGraphPath;
|
| 696 |
window.runOperatorCodeIndex = runOperatorCodeIndex;
|
| 697 |
+
window.runOperatorUrlIngest = runOperatorUrlIngest;
|
| 698 |
window.downloadGraphExport = downloadGraphExport;
|
| 699 |
window.selectGraphNode = selectGraphNode;
|
| 700 |
window.selectGraphNodeByKey = selectGraphNodeByKey;
|
frontend/js/state.js
CHANGED
|
@@ -36,6 +36,7 @@ const STATE = {
|
|
| 36 |
graphSourceKind: '',
|
| 37 |
graphAllowedRoots: [],
|
| 38 |
graphIndexDefaultRoot: '',
|
|
|
|
| 39 |
};
|
| 40 |
|
| 41 |
function stateRefreshCategories() {
|
|
|
|
| 36 |
graphSourceKind: '',
|
| 37 |
graphAllowedRoots: [],
|
| 38 |
graphIndexDefaultRoot: '',
|
| 39 |
+
graphAllowedHosts: [],
|
| 40 |
};
|
| 41 |
|
| 42 |
function stateRefreshCategories() {
|
tests/test_url_ingestion.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import shutil
|
| 3 |
+
from types import SimpleNamespace
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from backend.api import admin as admin_api
|
| 8 |
+
from backend.core import pipeline
|
| 9 |
+
from backend.core import url_ingestion
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class _FakeTask:
|
| 13 |
+
id = "task-123"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class _FakeResponse:
|
| 17 |
+
def __init__(self, *, url: str, content_type: str, body: bytes, headers=None, status_code: int = 200):
|
| 18 |
+
self.url = url
|
| 19 |
+
self._body = body
|
| 20 |
+
self.headers = {"content-type": content_type, **(headers or {})}
|
| 21 |
+
self.status_code = status_code
|
| 22 |
+
|
| 23 |
+
def raise_for_status(self):
|
| 24 |
+
if self.status_code >= 400:
|
| 25 |
+
raise url_ingestion.requests.HTTPError(f"HTTP {self.status_code}")
|
| 26 |
+
|
| 27 |
+
def iter_content(self, chunk_size=65536):
|
| 28 |
+
for start in range(0, len(self._body), chunk_size):
|
| 29 |
+
yield self._body[start:start + chunk_size]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class _FakeSession:
|
| 33 |
+
def __init__(self, response):
|
| 34 |
+
self._response = response
|
| 35 |
+
|
| 36 |
+
def __enter__(self):
|
| 37 |
+
return self
|
| 38 |
+
|
| 39 |
+
def __exit__(self, exc_type, exc, tb):
|
| 40 |
+
return False
|
| 41 |
+
|
| 42 |
+
def get(self, *args, **kwargs):
|
| 43 |
+
del args, kwargs
|
| 44 |
+
return self._response
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_fetch_url_to_tempfile_blocks_private_ip(monkeypatch):
|
| 48 |
+
monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", [])
|
| 49 |
+
monkeypatch.setattr(
|
| 50 |
+
url_ingestion.socket,
|
| 51 |
+
"getaddrinfo",
|
| 52 |
+
lambda *args, **kwargs: [(None, None, None, None, ("127.0.0.1", 443))],
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
with pytest.raises(url_ingestion.UrlIngestionError):
|
| 56 |
+
url_ingestion.fetch_url_to_tempfile("https://example.com/internal")
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_fetch_url_to_tempfile_extracts_readable_html(monkeypatch):
|
| 60 |
+
body = b"<html><head><title>Morpheus Docs</title></head><body><main><h1>Overview</h1><p>Hybrid retrieval works.</p></main></body></html>"
|
| 61 |
+
monkeypatch.setattr(url_ingestion.config, "URL_INGEST_ALLOWED_HOSTS", ["docs.example.com"])
|
| 62 |
+
monkeypatch.setattr(
|
| 63 |
+
url_ingestion.socket,
|
| 64 |
+
"getaddrinfo",
|
| 65 |
+
lambda *args, **kwargs: [(None, None, None, None, ("93.184.216.34", 443))],
|
| 66 |
+
)
|
| 67 |
+
monkeypatch.setattr(
|
| 68 |
+
url_ingestion.requests,
|
| 69 |
+
"Session",
|
| 70 |
+
lambda: _FakeSession(
|
| 71 |
+
_FakeResponse(
|
| 72 |
+
url="https://docs.example.com/guide",
|
| 73 |
+
content_type="text/html; charset=utf-8",
|
| 74 |
+
body=body,
|
| 75 |
+
)
|
| 76 |
+
),
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
result = url_ingestion.fetch_url_to_tempfile("https://docs.example.com/guide")
|
| 80 |
+
|
| 81 |
+
assert result.final_url == "https://docs.example.com/guide"
|
| 82 |
+
assert result.content_type == "text/html"
|
| 83 |
+
with open(result.temp_path, "r", encoding="utf-8") as handle:
|
| 84 |
+
rendered = handle.read()
|
| 85 |
+
assert "# Morpheus Docs" in rendered
|
| 86 |
+
assert "Hybrid retrieval works." in rendered
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def test_run_ingestion_with_url_override_persists_url_metadata(monkeypatch):
|
| 90 |
+
from tests.test_pipeline_regressions import FakeIngestionSupabase
|
| 91 |
+
|
| 92 |
+
fake_supabase = FakeIngestionSupabase()
|
| 93 |
+
captured = {}
|
| 94 |
+
root = Path("tests") / "_tmp_graph_hybrid" / "url_ingestion"
|
| 95 |
+
root.mkdir(parents=True, exist_ok=True)
|
| 96 |
+
try:
|
| 97 |
+
source_path = root / "remote.md"
|
| 98 |
+
source_path.write_text("# Remote Doc\n\nThis URL import contains enough content to ingest.\n", encoding="utf-8")
|
| 99 |
+
|
| 100 |
+
monkeypatch.setattr("backend.core.auth_utils.extract_jwt_sub", lambda token: "user-1")
|
| 101 |
+
monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda path: "url-hash")
|
| 102 |
+
monkeypatch.setattr(pipeline, "is_file_already_ingested", lambda file_hash, access_token=None: False)
|
| 103 |
+
monkeypatch.setattr(pipeline, "_recover_or_prepare_orphaned_upload", lambda *args, **kwargs: None)
|
| 104 |
+
monkeypatch.setattr(pipeline, "_build_supabase_client", lambda access_token=None: fake_supabase)
|
| 105 |
+
monkeypatch.setattr(pipeline, "_build_service_supabase_client", lambda: fake_supabase)
|
| 106 |
+
monkeypatch.setattr(
|
| 107 |
+
pipeline,
|
| 108 |
+
"extract_document_entities",
|
| 109 |
+
lambda *args, **kwargs: SimpleNamespace(is_allowed=True, document_type="general_document"),
|
| 110 |
+
)
|
| 111 |
+
monkeypatch.setattr(pipeline, "_build_document_tree", lambda elements: {"title": "root", "children": []})
|
| 112 |
+
monkeypatch.setattr(
|
| 113 |
+
pipeline,
|
| 114 |
+
"process_chunks",
|
| 115 |
+
lambda chunks, elements, path_for_naming, file_hash, graph_data, user_id, pdf_images, **kwargs: (
|
| 116 |
+
captured.setdefault(
|
| 117 |
+
"docs",
|
| 118 |
+
[SimpleNamespace(metadata={**kwargs, "source": "Remote Doc", "title": "Remote Doc"})],
|
| 119 |
+
),
|
| 120 |
+
["doc-1"],
|
| 121 |
+
),
|
| 122 |
+
)
|
| 123 |
+
monkeypatch.setattr(pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids))
|
| 124 |
+
monkeypatch.setattr(pipeline, "_persist_graph_foundation", lambda **kwargs: None)
|
| 125 |
+
monkeypatch.setattr(pipeline, "upload_to_supabase", lambda docs, ids, access_token=None: None)
|
| 126 |
+
monkeypatch.setattr(pipeline, "_identity_json_from_elements", lambda elements, fallback_title=None: {})
|
| 127 |
+
monkeypatch.setattr(pipeline, "_extract_pdf_title", lambda elements, filename: "Remote Doc")
|
| 128 |
+
monkeypatch.setattr(pipeline, "_log_ingestion_retry_event", lambda **kwargs: None)
|
| 129 |
+
|
| 130 |
+
result = pipeline.run_ingestion(
|
| 131 |
+
file_path=str(source_path),
|
| 132 |
+
original_filename="remote-doc.md",
|
| 133 |
+
access_token="token",
|
| 134 |
+
source_kind_override="url",
|
| 135 |
+
data_shape_override="hybrid",
|
| 136 |
+
parser_kind_override="url_fetch",
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
assert result["filename"] == "Remote Doc"
|
| 140 |
+
assert result["file_hash"] == "url-hash"
|
| 141 |
+
assert captured["docs"][0].metadata["source_kind"] == "url"
|
| 142 |
+
assert captured["docs"][0].metadata["data_shape"] == "hybrid"
|
| 143 |
+
assert captured["docs"][0].metadata["parser_kind"] == "url_fetch"
|
| 144 |
+
ingested_upsert = next(item for item in fake_supabase.upserts if item[0] == "ingested_files")
|
| 145 |
+
assert ingested_upsert[1]["source_kind"] == "url"
|
| 146 |
+
assert ingested_upsert[1]["parser_kind"] == "url_fetch"
|
| 147 |
+
finally:
|
| 148 |
+
shutil.rmtree(root, ignore_errors=True)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def test_run_url_ingest_requires_admin_and_queues_task(monkeypatch):
|
| 152 |
+
monkeypatch.setattr(admin_api, "_check_admin", lambda key: None)
|
| 153 |
+
monkeypatch.setattr(admin_api.config, "URL_INGEST_ADMIN_ENABLED", True, raising=False)
|
| 154 |
+
monkeypatch.setattr(
|
| 155 |
+
admin_api,
|
| 156 |
+
"fetch_url_to_tempfile",
|
| 157 |
+
lambda url, label=None: SimpleNamespace(
|
| 158 |
+
source_url=url,
|
| 159 |
+
final_url=url,
|
| 160 |
+
filename="Morpheus-Docs.md",
|
| 161 |
+
temp_path="C:/tmp/morpheus-url.md",
|
| 162 |
+
content_type="text/html",
|
| 163 |
+
content_bytes=1234,
|
| 164 |
+
title=label or "Morpheus Docs",
|
| 165 |
+
),
|
| 166 |
+
)
|
| 167 |
+
monkeypatch.setattr(admin_api.process_document_task, "delay", lambda *args: _FakeTask())
|
| 168 |
+
|
| 169 |
+
result = admin_api.run_url_ingest(
|
| 170 |
+
admin_api.UrlIngestPayload(url="https://docs.example.com/guide", label="Morpheus Docs"),
|
| 171 |
+
x_admin_key="admin",
|
| 172 |
+
x_auth_token="token",
|
| 173 |
+
user_id="user-1",
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
assert result["ok"] is True
|
| 177 |
+
assert result["task_id"] == "task-123"
|
| 178 |
+
assert result["filename"] == "Morpheus-Docs.md"
|