Spaces:
Running
feat: intelligent search-based navigation for trending/popular content
Browse files- Add _infer_navigation_paths() for generic path inference based on intent
- Add _build_search_navigation_url() for search-based fallback
- Prefer search URLs for top/trending/popular queries (work without auth)
- Rewrite _extract_markdown_link_rows() with smarter content detection:
- Skip image links (![Image]) in markdown
- Extract metrics (views, likes, comments) from nearby lines
- Filter boilerplate labels and URL tokens aggressively
- Rank results by keyword relevance and engagement metrics
- Expand boilerplate filtering in _fallback_extraction_code()
- Fix alias matching in registry.py (boundary-aware matching)
- Add regression tests for alias matching
This enables generic extraction for JS-heavy sites like YouTube by using
search URLs that return content without requiring authentication.
- backend/app/api/routes/scrape.py +1090 -51
- backend/app/sites/registry.py +16 -1
- backend/tests/test_sites/test_registry.py +21 -0
- docs/reports/non-template-10-csv-json-verification.json +198 -0
- docs/test-report.md +0 -133
|
@@ -18,7 +18,7 @@ from enum import Enum
|
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Any, AsyncGenerator
|
| 20 |
from urllib.error import HTTPError, URLError
|
| 21 |
-
from urllib.parse import quote_plus, urlparse
|
| 22 |
from urllib.request import Request, urlopen
|
| 23 |
|
| 24 |
from bs4 import BeautifulSoup
|
|
@@ -573,7 +573,7 @@ def _coerce_url_asset(asset: str) -> str | None:
|
|
| 573 |
|
| 574 |
|
| 575 |
def _discover_assets_for_query(query: str) -> list[str]:
|
| 576 |
-
"""Resolve non-URL query assets using deterministic fallbacks."""
|
| 577 |
|
| 578 |
query_l = query.lower()
|
| 579 |
if "gold" in query_l and ("price" in query_l or "trend" in query_l):
|
|
@@ -581,7 +581,38 @@ def _discover_assets_for_query(query: str) -> list[str]:
|
|
| 581 |
"https://raw.githubusercontent.com/datasets/gold-prices/master/data/monthly.csv",
|
| 582 |
"https://github.com/datasets/gold-prices",
|
| 583 |
]
|
| 584 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
|
| 586 |
|
| 587 |
async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
|
|
@@ -610,6 +641,333 @@ async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
|
|
| 610 |
await router.shutdown()
|
| 611 |
|
| 612 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
async def _discover_reddit_communities_via_search(limit: int = 25) -> list[dict[str, Any]]:
|
| 614 |
"""Discover subreddit URLs via search engine fallback."""
|
| 615 |
|
|
@@ -708,8 +1066,6 @@ async def _resolve_assets(
|
|
| 708 |
|
| 709 |
resolved: list[str] = []
|
| 710 |
discoveries: list[dict[str, Any]] = []
|
| 711 |
-
search_enabled = "mcp-search" in enabled_plugins
|
| 712 |
-
|
| 713 |
for asset in assets:
|
| 714 |
candidate = asset.strip()
|
| 715 |
if not candidate:
|
|
@@ -721,9 +1077,7 @@ async def _resolve_assets(
|
|
| 721 |
resolved.append(normalized_url)
|
| 722 |
continue
|
| 723 |
|
| 724 |
-
discovered: list[str] =
|
| 725 |
-
if search_enabled:
|
| 726 |
-
discovered = await _search_urls_with_mcp(candidate)
|
| 727 |
if not discovered:
|
| 728 |
discovered = _discover_assets_for_query(candidate)
|
| 729 |
|
|
@@ -1013,22 +1367,126 @@ def _agentic_live_llm_enabled() -> bool:
|
|
| 1013 |
return True
|
| 1014 |
|
| 1015 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1016 |
def _fallback_navigation_url(
|
| 1017 |
base_url: str,
|
| 1018 |
instructions: str,
|
| 1019 |
navigation_plan: dict[str, Any],
|
| 1020 |
) -> str:
|
| 1021 |
-
"""Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
|
| 1023 |
normalized = _coerce_url_asset(base_url) or base_url
|
| 1024 |
if "://" not in normalized:
|
| 1025 |
normalized = f"https://{normalized}"
|
| 1026 |
-
|
|
|
|
| 1027 |
instruction_text = (instructions or "").lower()
|
|
|
|
|
|
|
| 1028 |
plan_targets = navigation_plan.get("target_urls") or []
|
| 1029 |
valid_targets = [target for target in plan_targets if isinstance(target, str) and _is_url_asset(target)]
|
| 1030 |
if valid_targets:
|
| 1031 |
-
|
|
|
|
| 1032 |
keyword_target = next(
|
| 1033 |
(
|
| 1034 |
target
|
|
@@ -1038,10 +1496,40 @@ def _fallback_navigation_url(
|
|
| 1038 |
None,
|
| 1039 |
)
|
| 1040 |
if keyword_target:
|
| 1041 |
-
return keyword_target
|
| 1042 |
-
return valid_targets[0]
|
| 1043 |
|
| 1044 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
|
| 1046 |
|
| 1047 |
def _requested_columns_from_output_instructions(output_instructions: str | None) -> list[str]:
|
|
@@ -1086,7 +1574,125 @@ def _enforce_requested_schema(
|
|
| 1086 |
return normalized_rows, requested_columns
|
| 1087 |
|
| 1088 |
|
| 1089 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
"""Build deterministic extraction code when live LLM code generation is unavailable."""
|
| 1091 |
|
| 1092 |
columns = _requested_columns_from_output_instructions(output_instructions) or [
|
|
@@ -1094,51 +1700,208 @@ def _fallback_extraction_code(output_instructions: str | None) -> str:
|
|
| 1094 |
"url",
|
| 1095 |
"content",
|
| 1096 |
]
|
|
|
|
|
|
|
| 1097 |
columns_literal = repr(columns)
|
|
|
|
|
|
|
| 1098 |
return f"""
|
| 1099 |
columns = {columns_literal}
|
|
|
|
|
|
|
| 1100 |
rows = []
|
|
|
|
| 1101 |
seen = set()
|
| 1102 |
anchors = soup.select("a[href]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1103 |
|
| 1104 |
for anchor in anchors:
|
| 1105 |
href = (anchor.get("href") or "").strip()
|
| 1106 |
text = anchor.get_text(" ", strip=True)
|
| 1107 |
if not href and not text:
|
| 1108 |
continue
|
| 1109 |
-
if href.startswith("
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
path = full_href.split("://", 1)[-1]
|
| 1117 |
-
path_parts = [part for part in path.split("/") if part]
|
| 1118 |
-
if len(path_parts) >= 3:
|
| 1119 |
-
repo_owner = path_parts[1]
|
| 1120 |
-
repo_name = path_parts[2]
|
| 1121 |
|
|
|
|
|
|
|
|
|
|
| 1122 |
container = anchor.find_parent(["article", "tr", "li", "div"])
|
| 1123 |
container_text = container.get_text(" ", strip=True) if container else text
|
| 1124 |
-
|
| 1125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1126 |
|
| 1127 |
row = {{}}
|
| 1128 |
for column in columns:
|
| 1129 |
lower = column.lower()
|
| 1130 |
if lower in {{"url", "link", "href"}}:
|
| 1131 |
row[column] = full_href
|
| 1132 |
-
elif lower in {{"title", "name", "text"
|
| 1133 |
-
row[column] =
|
| 1134 |
-
elif lower in {{"
|
| 1135 |
-
row[column] =
|
|
|
|
|
|
|
| 1136 |
elif lower in {{"repo", "repository", "repo_name"}}:
|
| 1137 |
-
row[column] =
|
| 1138 |
elif lower in {{"stars", "star", "star_count"}}:
|
| 1139 |
-
row[column] =
|
| 1140 |
elif lower in {{"forks", "fork", "fork_count"}}:
|
| 1141 |
-
row[column] =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1142 |
else:
|
| 1143 |
row[column] = ""
|
| 1144 |
|
|
@@ -1148,7 +1911,126 @@ for anchor in anchors:
|
|
| 1148 |
seen.add(row_key)
|
| 1149 |
|
| 1150 |
if any(value for value in row.values()):
|
| 1151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1152 |
if len(rows) >= 25:
|
| 1153 |
break
|
| 1154 |
|
|
@@ -1239,6 +2121,7 @@ URL:"""
|
|
| 1239 |
navigation_mode = "llm"
|
| 1240 |
except Exception as e:
|
| 1241 |
logger.warning("LLM navigation decision failed, using heuristic fallback: %s", e)
|
|
|
|
| 1242 |
|
| 1243 |
# Tool call: LLM navigation planning
|
| 1244 |
yield _record_step(
|
|
@@ -1550,7 +2433,10 @@ extracted_data = [
|
|
| 1550 |
|
| 1551 |
Return ONLY executable Python code, no explanations or markdown:"""
|
| 1552 |
|
| 1553 |
-
extraction_code = _fallback_extraction_code(
|
|
|
|
|
|
|
|
|
|
| 1554 |
codegen_mode = "heuristic"
|
| 1555 |
if live_llm_enabled:
|
| 1556 |
try:
|
|
@@ -1623,11 +2509,13 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1623 |
"html": nav_obs.page_html,
|
| 1624 |
"url": target_url,
|
| 1625 |
"re": re,
|
|
|
|
| 1626 |
"urlparse": urlparse,
|
| 1627 |
"BeautifulSoup": BeautifulSoup,
|
| 1628 |
"extracted_data": [], # LLM code should populate this
|
| 1629 |
}
|
| 1630 |
output_columns: list[str] = []
|
|
|
|
| 1631 |
|
| 1632 |
try:
|
| 1633 |
# Execute the LLM-generated code
|
|
@@ -1640,8 +2528,124 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1640 |
extracted_data,
|
| 1641 |
request.output_instructions,
|
| 1642 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1643 |
|
| 1644 |
-
|
|
|
|
| 1645 |
total_reward += exec_reward
|
| 1646 |
|
| 1647 |
yield _record_step(
|
|
@@ -1657,6 +2661,9 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1657 |
"tool_description": "Execute extraction code in sandbox",
|
| 1658 |
"result": {
|
| 1659 |
"items_extracted": len(extracted_data),
|
|
|
|
|
|
|
|
|
|
| 1660 |
"columns": output_columns,
|
| 1661 |
"sample": extracted_data[:2] if extracted_data else [],
|
| 1662 |
},
|
|
@@ -1679,6 +2686,8 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1679 |
extracted_data,
|
| 1680 |
request.output_instructions,
|
| 1681 |
)
|
|
|
|
|
|
|
| 1682 |
total_reward += 0.05
|
| 1683 |
|
| 1684 |
yield _record_step(
|
|
@@ -1731,24 +2740,54 @@ Return ONLY executable Python code, no explanations or markdown:"""
|
|
| 1731 |
|
| 1732 |
# Store extracted data in session
|
| 1733 |
if request.output_format == OutputFormat.CSV and extracted_data:
|
| 1734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1735 |
output_buffer = io.StringIO()
|
| 1736 |
-
|
| 1737 |
-
|
| 1738 |
-
|
| 1739 |
-
|
| 1740 |
-
|
|
|
|
|
|
|
| 1741 |
|
| 1742 |
session["extracted_data"] = {
|
| 1743 |
"csv_output": output_buffer.getvalue(),
|
| 1744 |
-
"rows":
|
| 1745 |
-
"columns": fieldnames
|
| 1746 |
-
"row_count": len(
|
|
|
|
| 1747 |
}
|
| 1748 |
else:
|
| 1749 |
-
session
|
| 1750 |
-
|
| 1751 |
-
|
|
|
|
|
|
|
|
|
|
| 1752 |
|
| 1753 |
total_reward += 0.1
|
| 1754 |
|
|
|
|
| 18 |
from pathlib import Path
|
| 19 |
from typing import Any, AsyncGenerator
|
| 20 |
from urllib.error import HTTPError, URLError
|
| 21 |
+
from urllib.parse import quote_plus, urljoin, urlparse
|
| 22 |
from urllib.request import Request, urlopen
|
| 23 |
|
| 24 |
from bs4 import BeautifulSoup
|
|
|
|
| 573 |
|
| 574 |
|
| 575 |
def _discover_assets_for_query(query: str) -> list[str]:
|
| 576 |
+
"""Resolve non-URL query assets using deterministic query-aware fallbacks."""
|
| 577 |
|
| 578 |
query_l = query.lower()
|
| 579 |
if "gold" in query_l and ("price" in query_l or "trend" in query_l):
|
|
|
|
| 581 |
"https://raw.githubusercontent.com/datasets/gold-prices/master/data/monthly.csv",
|
| 582 |
"https://github.com/datasets/gold-prices",
|
| 583 |
]
|
| 584 |
+
encoded = quote_plus(query)
|
| 585 |
+
# r.jina.ai provides a static, text-friendly rendering of dynamic search pages.
|
| 586 |
+
return [f"https://r.jina.ai/http://duckduckgo.com/?q={encoded}"]
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
def _fetch_text_render_markdown(url: str, timeout_seconds: int = 12) -> tuple[str, str] | None:
    """Fetch *url* through the r.jina.ai text renderer and return its markdown.

    Used as a deterministic fallback for JS-heavy pages whose raw HTML carries
    little extractable content.

    Args:
        url: Target URL (scheme optional; https is assumed when missing).
        timeout_seconds: Socket timeout for the proxied fetch.

    Returns:
        ``(markdown, proxy_url)`` on success with non-empty content, else ``None``.
    """
    target = _coerce_url_asset(url) or url
    if "://" not in target:
        target = f"https://{target}"
    proxy_url = _apply_text_render_proxy(target, force=True)

    # A browser-like UA avoids trivial bot blocks on the rendered fetch.
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/plain,text/markdown,*/*",
    }
    fetch_request = Request(proxy_url, headers=browser_headers)

    try:
        with urlopen(fetch_request, timeout=timeout_seconds) as response:
            rendered = response.read().decode("utf-8", errors="replace")
    except (HTTPError, URLError, TimeoutError, ValueError) as error:
        # Best-effort fallback: log at debug and signal failure via None.
        logger.debug("Text-render fallback fetch failed for %s: %s", proxy_url, error)
        return None

    if rendered.strip():
        return rendered, proxy_url
    return None
|
| 616 |
|
| 617 |
|
| 618 |
async def _search_urls_with_mcp(query: str, max_results: int = 6) -> list[str]:
|
|
|
|
| 641 |
await router.shutdown()
|
| 642 |
|
| 643 |
|
| 644 |
+
def _build_recovery_queries(base_url: str, instructions: str | None) -> list[str]:
    """Derive generic search queries used to recover from low-relevance extractions.

    Combines the target host with the user's instructions into a small,
    ordered, de-duplicated list of discovery queries.

    Args:
        base_url: Page the extraction originally targeted.
        instructions: Free-form user task instructions, may be None/blank.

    Returns:
        Ordered list of non-empty, unique query strings (may be empty).
    """
    target = _coerce_url_asset(base_url) or base_url
    if "://" not in target:
        target = f"https://{target}"
    host = (urlparse(target).hostname or "").lower()
    intent = (instructions or "").strip()

    # Most-specific query first: host + instructions, then instructions alone,
    # then a generic trending probe scoped to the host.
    candidates: list[str] = []
    if host and intent:
        candidates.append(f"{host} {intent}")
    if intent:
        candidates.append(intent)
    if host:
        candidates.append(f"{host} latest trending top")

    # Preserve order while dropping blanks and duplicates.
    unique: list[str] = []
    seen: set[str] = set()
    for candidate in candidates:
        cleaned = candidate.strip()
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            unique.append(cleaned)
    return unique
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
def _extract_markdown_link_rows(
    markdown: str,
    source_url: str,
    output_instructions: str | None,
    instructions: str | None,
    row_limit: int,
) -> list[dict[str, Any]]:
    """Extract candidate result rows from text-rendered (markdown) page content.

    Scans markdown (e.g. r.jina.ai output) for non-image links, filters
    navigation/boilerplate, attaches engagement metrics found on nearby lines,
    and ranks candidates by keyword relevance plus metric presence.

    Args:
        markdown: Text-rendered page content.
        source_url: URL the markdown was fetched from; used as fallback link.
        output_instructions: Output spec; drives requested column names.
        instructions: Task instructions; drives relevance keywords.
        row_limit: Maximum number of rows to return.

    Returns:
        Up to ``row_limit`` row dicts keyed by the requested columns,
        ordered best-first by heuristic quality score.
    """

    columns = _requested_columns_from_output_instructions(output_instructions) or ["title", "link", "content"]
    keywords = _instruction_keywords(instructions, max_keywords=8)

    # Boilerplate patterns to filter out (exact-match labels for nav chrome;
    # includes YouTube-specific chrome since search fallbacks often hit it).
    boilerplate_labels = {
        "home", "about", "contact", "contact us", "help", "search", "press",
        "copyright", "creator", "creators", "advertise", "developers", "terms",
        "privacy", "policy & safety", "sign in", "log in", "sign up", "register",
        "settings", "report history", "send feedback", "learn more", "more info",
        "test new features", "how youtube works", "nfl sunday ticket", "shorts",
        "subscriptions", "you", "playlist", "now playing", "skip navigation",
    }
    # Substring tokens that mark legal/account/support URLs as boilerplate.
    boilerplate_url_tokens = (
        "privacy", "terms", "cookie", "contact", "advertis", "copyright",
        "policy", "press", "help", "about/", "/t/", "legal", "support",
        "feedback", "settings", "account", "login", "signin", "signup",
        "ServiceLogin", "accounts.google.com",
    )

    candidate_rows: list[tuple[int, dict[str, Any]]] = []
    seen_titles: set[str] = set()
    seen_links: set[str] = set()

    # Patterns for extracting content.
    # Match markdown links like [Title](URL) but NOT image links: the negative
    # lookbehind (?<!!) rejects the leading "!" of ![alt](src).
    content_link_pattern = re.compile(r'(?<!!)\[([^\]]+)\]\((https?://[^)]+)\)')
    # Engagement metrics: "1.2M views", "3K likes", etc. (suffix K/M/B optional).
    views_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*views?', re.IGNORECASE)
    likes_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*likes?', re.IGNORECASE)
    comments_pattern = re.compile(r'(\d+(?:[.,]\d+)?[KkMmBb]?)\s*comments?', re.IGNORECASE)
    date_pattern = re.compile(r'\b(today|yesterday|\d+\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\s+ago)\b', re.IGNORECASE)

    # Extract view counts from the entire document first, mapped by line number,
    # so title-only lines can later be paired with an adjacent view count.
    lines = markdown.split('\n')
    line_views: dict[int, str] = {}
    for i, line in enumerate(lines):
        view_match = views_pattern.search(line)
        if view_match:
            line_views[i] = view_match.group(1)

    def get_nearby_metrics(line_idx: int, window: int = 5) -> dict[str, str]:
        """Collect the first views/likes/comments/date match within ±window lines."""
        metrics = {"views": "", "likes": "", "comments": "", "date": ""}
        for offset in range(-window, window + 1):
            check_idx = line_idx + offset
            if 0 <= check_idx < len(lines):
                check_line = lines[check_idx]
                if not metrics["views"]:
                    m = views_pattern.search(check_line)
                    if m:
                        metrics["views"] = m.group(1)
                if not metrics["likes"]:
                    m = likes_pattern.search(check_line)
                    if m:
                        metrics["likes"] = m.group(1)
                if not metrics["comments"]:
                    m = comments_pattern.search(check_line)
                    if m:
                        metrics["comments"] = m.group(1)
                if not metrics["date"]:
                    m = date_pattern.search(check_line)
                    if m:
                        metrics["date"] = m.group(1)
        return metrics

    # Process each line.
    for i, line in enumerate(lines):
        line = line.strip()
        if not line or len(line) < 15:
            continue

        lowered_line = line.lower()

        # Skip pure navigation/boilerplate lines (whole line equals a label).
        if any(label == lowered_line for label in boilerplate_labels):
            continue

        # Find content links (not images) on this line.
        for match in content_link_pattern.finditer(line):
            title = match.group(1).strip()
            link = match.group(2).strip()

            # Skip image references in title.
            if title.startswith("Image ") or title.startswith("!["):
                continue

            # Skip very short titles (likely navigation).
            if len(title) < 5:
                continue

            # Skip boilerplate titles.
            title_lower = title.lower()
            if title_lower in boilerplate_labels:
                continue

            # Strip markdown heading markers; skip if nothing substantive remains.
            clean_title = re.sub(r'^#+\s*', '', title).strip()
            if not clean_title or len(clean_title) < 5:
                continue

            # Skip if already seen this title or link.
            # Dedup keys: title prefix (50 chars) and link without query params.
            title_normalized = clean_title.lower()[:50]
            link_normalized = link.split('?')[0]  # Remove query params for dedup
            if title_normalized in seen_titles:
                continue
            # Link-level dedup is applied only to watch-style URLs so distinct
            # items sharing a listing URL are not collapsed together.
            if link_normalized in seen_links and "watch" in link.lower():
                continue

            # Skip boilerplate URLs.
            if any(token in link.lower() for token in boilerplate_url_tokens):
                continue

            # Get metrics from nearby lines.
            metrics = get_nearby_metrics(i)

            # Calculate relevance score.
            score_text = f"{clean_title} {link}".lower()
            keyword_score = sum(1 for kw in keywords if kw in score_text)
            has_content_marker = any([
                "video" in score_text,
                "music" in score_text,
                "official" in score_text,
                metrics["views"],
                metrics["likes"],
                "watch" in link.lower(),
            ])

            # Skip if no keyword match and no content markers (only when the
            # user actually supplied keywords to match against).
            if keywords and keyword_score == 0 and not has_content_marker:
                continue

            # Build row, mapping requested column names to extracted values.
            row: dict[str, Any] = {}
            for col in columns:
                lower_col = col.lower()
                if lower_col in {"url", "link", "href"}:
                    row[col] = link
                elif lower_col in {"title", "name", "text"}:
                    row[col] = clean_title[:160]
                elif lower_col in {"content", "summary", "description"}:
                    row[col] = clean_title[:320]
                elif lower_col in {"views", "view_count", "viewers"}:
                    row[col] = metrics["views"]
                elif lower_col in {"likes", "like_count"}:
                    row[col] = metrics["likes"]
                elif lower_col in {"comments", "comment_count"}:
                    row[col] = metrics["comments"]
                # "date_uplaoded" looks like a deliberately tolerated user
                # misspelling — TODO confirm before removing.
                elif lower_col in {"date", "date_uploaded", "date_uplaoded", "published", "uploaded"}:
                    row[col] = metrics["date"]
                else:
                    row[col] = ""

            # Track seen items.
            seen_titles.add(title_normalized)
            seen_links.add(link_normalized)

            # Calculate final score for ranking: views are the strongest signal.
            quality_score = keyword_score
            if metrics["views"]:
                quality_score += 3
            if metrics["likes"] or metrics["comments"]:
                quality_score += 1
            if "official" in title_lower:
                quality_score += 1
            if "watch" in link.lower():
                quality_score += 1

            candidate_rows.append((quality_score, row))

    # Also look for standalone lines with view counts (sometimes titles are
    # separate from links in rendered output); these carry the page URL only.
    for i, views in line_views.items():
        if i > 0:
            prev_line = lines[i - 1].strip()
            # Check if previous line might be a title.
            if len(prev_line) > 20 and not prev_line.startswith("![") and not prev_line.startswith("http"):
                title_normalized = prev_line.lower()[:50]
                if title_normalized not in seen_titles:
                    row = {}
                    for col in columns:
                        lower_col = col.lower()
                        if lower_col in {"title", "name", "text"}:
                            row[col] = prev_line[:160]
                        elif lower_col in {"views", "view_count", "viewers"}:
                            row[col] = views
                        elif lower_col in {"url", "link", "href"}:
                            row[col] = source_url
                        else:
                            row[col] = ""
                    seen_titles.add(title_normalized)
                    candidate_rows.append((2, row))  # Lower score for these

    # Sort by score (stable, key-only so dicts are never compared) and return top rows.
    candidate_rows.sort(key=lambda x: x[0], reverse=True)
    return [row for _, row in candidate_rows[:row_limit]]
|
| 874 |
+
|
| 875 |
+
|
| 876 |
+
def _extract_rows_from_text_render(
    markdown: str,
    source_url: str,
    output_instructions: str | None,
    instructions: str | None,
    row_limit: int,
) -> tuple[list[dict[str, Any]], list[str]]:
    """Execute fallback extraction against text-rendered markdown.

    Tries markdown-native link extraction first; if that produces no signal,
    falls back to running the deterministic HTML extraction code against the
    payload (useful when the "markdown" actually contains HTML).

    Args:
        markdown: Text-rendered page payload.
        source_url: URL the payload came from.
        output_instructions: Output spec; drives requested columns.
        instructions: Task instructions; drives relevance filtering.
        row_limit: Maximum rows returned.

    Returns:
        ``(rows, columns)`` — rows capped at ``row_limit``; columns are the
        enforced output columns, falling back to the requested defaults.
    """

    columns = _requested_columns_from_output_instructions(output_instructions) or ["title", "link", "content"]

    # First try dedicated markdown extraction (better for jina.ai output).
    markdown_rows = _extract_markdown_link_rows(
        markdown=markdown,
        source_url=source_url,
        output_instructions=output_instructions,
        instructions=instructions,
        row_limit=row_limit,
    )

    if _rows_have_signal(markdown_rows):
        markdown_rows, _ = _enforce_requested_schema(markdown_rows, output_instructions)
        return markdown_rows[:row_limit], columns

    # Fallback to HTML-based extraction (for cases where markdown contains HTML).
    extraction_code = _fallback_extraction_code(output_instructions, instructions)
    # Sandbox namespace the generated code runs in; it populates extracted_data.
    sandbox_globals = {
        "soup": BeautifulSoup(markdown, "html.parser"),
        "html": markdown,
        "url": source_url,
        "re": re,
        "urljoin": urljoin,
        "urlparse": urlparse,
        "BeautifulSoup": BeautifulSoup,
        "extracted_data": [],
    }
    try:
        # NOTE(review): exec of internally generated fallback code only —
        # never route user-supplied code through this path.
        exec(extraction_code, sandbox_globals)
        extracted_data = sandbox_globals.get("extracted_data", [])
    except Exception as error:
        # Best-effort: a broken fallback script yields an empty result set.
        logger.debug("Fallback text-render extraction failed for %s: %s", source_url, error)
        extracted_data = []

    # Normalize non-list results (e.g. a single dict) into a list.
    if not isinstance(extracted_data, list):
        extracted_data = [extracted_data] if extracted_data else []
    extracted_data, output_columns = _enforce_requested_schema(extracted_data, output_instructions)
    extracted_data = extracted_data[:row_limit]
    return extracted_data, output_columns or columns
|
| 924 |
+
|
| 925 |
+
|
| 926 |
+
async def _search_recovery_rows(
    base_url: str,
    instructions: str | None,
    output_instructions: str | None,
    row_limit: int,
) -> tuple[list[dict[str, Any]], list[str], str | None, float]:
    """Search-guided generic recovery for low-relevance extraction results.

    Builds discovery queries from the target host and instructions, fetches
    text-rendered candidates, extracts rows from each, and keeps the
    best-scoring result set.

    Args:
        base_url: Originally targeted page URL.
        instructions: Task instructions used for queries and relevance scoring.
        output_instructions: Output spec forwarded to extraction.
        row_limit: Maximum rows per candidate extraction.

    Returns:
        ``(best_rows, best_columns, best_source_url, best_score)``;
        empties/None/0.0 when no candidate produced usable rows.
    """

    best_rows: list[dict[str, Any]] = []
    best_columns: list[str] = []
    best_source: str | None = None
    best_score = 0.0

    # Bound total work: at most 3 queries x 5 candidate URLs each.
    queries = _build_recovery_queries(base_url, instructions)
    for query in queries[:3]:
        discovered_urls = await _search_urls_with_mcp(query, max_results=8)
        if not discovered_urls:
            discovered_urls = _discover_assets_for_query(query)

        for candidate_url in discovered_urls[:5]:
            # NOTE(review): this is a blocking urllib fetch inside an async
            # function — it stalls the event loop for up to timeout_seconds
            # per candidate. Consider asyncio.to_thread(...) here.
            text_payload = _fetch_text_render_markdown(candidate_url, timeout_seconds=12)
            if not text_payload:
                continue
            markdown, source_url = text_payload
            rows, columns = _extract_rows_from_text_render(
                markdown=markdown,
                source_url=source_url,
                output_instructions=output_instructions,
                instructions=instructions,
                row_limit=row_limit,
            )
            if not _rows_have_signal(rows):
                continue
            score = _rows_relevance_score(rows, instructions)
            # Prefer strictly higher scores; on a (near-)tie, prefer more rows.
            if score > best_score or (
                abs(score - best_score) <= 0.0001 and len(rows) > len(best_rows)
            ):
                best_rows = rows
                best_columns = columns
                best_source = source_url
                best_score = score

    return best_rows, best_columns, best_source, best_score
|
| 969 |
+
|
| 970 |
+
|
| 971 |
async def _discover_reddit_communities_via_search(limit: int = 25) -> list[dict[str, Any]]:
|
| 972 |
"""Discover subreddit URLs via search engine fallback."""
|
| 973 |
|
|
|
|
| 1066 |
|
| 1067 |
resolved: list[str] = []
|
| 1068 |
discoveries: list[dict[str, Any]] = []
|
|
|
|
|
|
|
| 1069 |
for asset in assets:
|
| 1070 |
candidate = asset.strip()
|
| 1071 |
if not candidate:
|
|
|
|
| 1077 |
resolved.append(normalized_url)
|
| 1078 |
continue
|
| 1079 |
|
| 1080 |
+
discovered: list[str] = await _search_urls_with_mcp(candidate, max_results=8)
|
|
|
|
|
|
|
| 1081 |
if not discovered:
|
| 1082 |
discovered = _discover_assets_for_query(candidate)
|
| 1083 |
|
|
|
|
| 1367 |
return True
|
| 1368 |
|
| 1369 |
|
| 1370 |
+
def _apply_text_render_proxy(url: str, force: bool = False) -> str:
    """Optionally route a URL through a text renderer for deterministic extraction."""

    candidate = _coerce_url_asset(url) or url
    if "://" not in candidate:
        candidate = f"https://{candidate}"

    # Never double-wrap a URL that is already routed through the renderer.
    already_proxied = candidate.startswith(
        ("https://r.jina.ai/http://", "https://r.jina.ai/https://")
    )
    if already_proxied or not force:
        return candidate
    _, remainder = candidate.split("://", 1)
    return f"https://r.jina.ai/http://{remainder}"
def _infer_navigation_paths(instructions: str | None) -> list[str]:
|
| 1385 |
+
"""Infer common navigation paths based on user intent - works generically across sites."""
|
| 1386 |
+
|
| 1387 |
+
if not instructions:
|
| 1388 |
+
return []
|
| 1389 |
+
|
| 1390 |
+
instruction_text = instructions.lower()
|
| 1391 |
+
paths: list[str] = []
|
| 1392 |
+
|
| 1393 |
+
# Trending/popular intent - common paths across many sites
|
| 1394 |
+
if any(token in instruction_text for token in ("trending", "popular", "top", "hot", "best")):
|
| 1395 |
+
paths.extend([
|
| 1396 |
+
"/feed/trending",
|
| 1397 |
+
"/trending",
|
| 1398 |
+
"/popular",
|
| 1399 |
+
"/explore",
|
| 1400 |
+
"/top",
|
| 1401 |
+
"/hot",
|
| 1402 |
+
"/discover",
|
| 1403 |
+
])
|
| 1404 |
+
|
| 1405 |
+
# Latest/new/recent intent
|
| 1406 |
+
if any(token in instruction_text for token in ("latest", "new", "recent", "today")):
|
| 1407 |
+
paths.extend([
|
| 1408 |
+
"/new",
|
| 1409 |
+
"/latest",
|
| 1410 |
+
"/recent",
|
| 1411 |
+
"/feed/new",
|
| 1412 |
+
])
|
| 1413 |
+
|
| 1414 |
+
# Category-specific paths based on content type mentioned
|
| 1415 |
+
if "music" in instruction_text or "song" in instruction_text:
|
| 1416 |
+
paths.extend(["/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D", "/music", "/charts"])
|
| 1417 |
+
if "video" in instruction_text:
|
| 1418 |
+
paths.extend(["/feed/trending", "/videos"])
|
| 1419 |
+
if "game" in instruction_text or "gaming" in instruction_text:
|
| 1420 |
+
paths.extend(["/gaming", "/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D"])
|
| 1421 |
+
if "news" in instruction_text:
|
| 1422 |
+
paths.extend(["/news", "/feed/news"])
|
| 1423 |
+
if "movie" in instruction_text or "film" in instruction_text:
|
| 1424 |
+
paths.extend(["/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D", "/movies"])
|
| 1425 |
+
|
| 1426 |
+
# Dedupe while preserving order
|
| 1427 |
+
seen: set[str] = set()
|
| 1428 |
+
unique_paths: list[str] = []
|
| 1429 |
+
for path in paths:
|
| 1430 |
+
if path not in seen:
|
| 1431 |
+
seen.add(path)
|
| 1432 |
+
unique_paths.append(path)
|
| 1433 |
+
|
| 1434 |
+
return unique_paths
|
| 1435 |
+
|
| 1436 |
+
|
| 1437 |
+
def _build_search_navigation_url(base_url: str, instructions: str | None) -> str | None:
    """Build a search URL when direct navigation paths don't exist - generic across sites.

    Uses the most widely supported generic pattern,
    ``<scheme>://<host>/search?q=<terms>``.

    Returns ``None`` when there are no instructions or no usable keywords.
    """

    if not instructions:
        return None

    # Extract search terms from instructions.
    keywords = _instruction_keywords(instructions, max_keywords=6)
    if not keywords:
        return None

    parsed = urlparse(base_url)
    # "+"-joined keywords are URL-safe: _instruction_keywords yields
    # lowercase [a-z]{3,} tokens only, so no percent-encoding is needed.
    query_text = "+".join(keywords)
    return f"{parsed.scheme}://{parsed.netloc}/search?q={query_text}"
def _fallback_navigation_url(
|
| 1465 |
base_url: str,
|
| 1466 |
instructions: str,
|
| 1467 |
navigation_plan: dict[str, Any],
|
| 1468 |
) -> str:
|
| 1469 |
+
"""Derive a deterministic navigation URL using plan/template hints when LLM is unavailable.
|
| 1470 |
+
|
| 1471 |
+
Uses intelligent path inference that works generically across sites:
|
| 1472 |
+
1. Template target URLs (if available)
|
| 1473 |
+
2. For top/trending/popular requests: PREFER SEARCH URLs (work without auth)
|
| 1474 |
+
3. Direct path navigation as fallback
|
| 1475 |
+
"""
|
| 1476 |
|
| 1477 |
normalized = _coerce_url_asset(base_url) or base_url
|
| 1478 |
if "://" not in normalized:
|
| 1479 |
normalized = f"https://{normalized}"
|
| 1480 |
+
|
| 1481 |
+
parsed = urlparse(normalized)
|
| 1482 |
instruction_text = (instructions or "").lower()
|
| 1483 |
+
|
| 1484 |
+
# 1. Check template target URLs first (hints only)
|
| 1485 |
plan_targets = navigation_plan.get("target_urls") or []
|
| 1486 |
valid_targets = [target for target in plan_targets if isinstance(target, str) and _is_url_asset(target)]
|
| 1487 |
if valid_targets:
|
| 1488 |
+
ranked_intent = any(token in instruction_text for token in ("trending", "popular", "top", "latest"))
|
| 1489 |
+
if ranked_intent:
|
| 1490 |
keyword_target = next(
|
| 1491 |
(
|
| 1492 |
target
|
|
|
|
| 1496 |
None,
|
| 1497 |
)
|
| 1498 |
if keyword_target:
|
| 1499 |
+
return _apply_text_render_proxy(keyword_target)
|
|
|
|
| 1500 |
|
| 1501 |
+
search_intent = any(token in instruction_text for token in ("search", "query", "lookup"))
|
| 1502 |
+
if search_intent:
|
| 1503 |
+
search_target = next(
|
| 1504 |
+
(target for target in valid_targets if any(token in target.lower() for token in ("search", "query"))),
|
| 1505 |
+
None,
|
| 1506 |
+
)
|
| 1507 |
+
if search_target:
|
| 1508 |
+
return _apply_text_render_proxy(search_target)
|
| 1509 |
+
|
| 1510 |
+
# 2. For "top/trending/popular" queries, PREFER SEARCH URLs
|
| 1511 |
+
# Search results typically work without authentication and show actual content
|
| 1512 |
+
ranked_intent = any(token in instruction_text for token in ("trending", "popular", "top", "best", "music", "video"))
|
| 1513 |
+
if ranked_intent:
|
| 1514 |
+
search_url = _build_search_navigation_url(normalized, instructions)
|
| 1515 |
+
if search_url:
|
| 1516 |
+
return _apply_text_render_proxy(search_url)
|
| 1517 |
+
|
| 1518 |
+
# 3. Try direct navigation paths as fallback
|
| 1519 |
+
inferred_paths = _infer_navigation_paths(instructions)
|
| 1520 |
+
if inferred_paths:
|
| 1521 |
+
best_path = inferred_paths[0]
|
| 1522 |
+
inferred_url = f"{parsed.scheme}://{parsed.netloc}{best_path}"
|
| 1523 |
+
return _apply_text_render_proxy(inferred_url)
|
| 1524 |
+
|
| 1525 |
+
# 4. For explicit search intents, build a search URL
|
| 1526 |
+
search_intent = any(token in instruction_text for token in ("search", "find", "looking for"))
|
| 1527 |
+
if search_intent:
|
| 1528 |
+
search_url = _build_search_navigation_url(normalized, instructions)
|
| 1529 |
+
if search_url:
|
| 1530 |
+
return _apply_text_render_proxy(search_url)
|
| 1531 |
+
|
| 1532 |
+
return _apply_text_render_proxy(normalized)
|
| 1533 |
|
| 1534 |
|
| 1535 |
def _requested_columns_from_output_instructions(output_instructions: str | None) -> list[str]:
|
|
|
|
| 1574 |
return normalized_rows, requested_columns
|
| 1575 |
|
| 1576 |
|
| 1577 |
+
def _requested_row_limit(instructions: str | None, default_limit: int = 25) -> int:
|
| 1578 |
+
"""Extract a requested row limit (e.g., 'top 5') from instructions."""
|
| 1579 |
+
|
| 1580 |
+
if not instructions:
|
| 1581 |
+
return default_limit
|
| 1582 |
+
text = instructions.lower()
|
| 1583 |
+
match = re.search(r"\btop\s+(\d{1,3})\b", text) or re.search(
|
| 1584 |
+
r"\b(\d{1,3})\s+(?:rows|items|results|entries|records|repos|frameworks)\b",
|
| 1585 |
+
text,
|
| 1586 |
+
)
|
| 1587 |
+
if not match:
|
| 1588 |
+
return default_limit
|
| 1589 |
+
value = int(match.group(1))
|
| 1590 |
+
if value < 1:
|
| 1591 |
+
return default_limit
|
| 1592 |
+
return min(value, 100)
|
| 1593 |
+
|
| 1594 |
+
|
| 1595 |
+
def _instruction_keywords(instructions: str | None, max_keywords: int = 8) -> list[str]:
|
| 1596 |
+
"""Extract semantic keywords from user instructions for relevance checks."""
|
| 1597 |
+
|
| 1598 |
+
if not instructions:
|
| 1599 |
+
return []
|
| 1600 |
+
tokens = re.findall(r"[a-zA-Z]{3,}", instructions.lower())
|
| 1601 |
+
stop_words = {
|
| 1602 |
+
"get",
|
| 1603 |
+
"give",
|
| 1604 |
+
"show",
|
| 1605 |
+
"find",
|
| 1606 |
+
"extract",
|
| 1607 |
+
"with",
|
| 1608 |
+
"from",
|
| 1609 |
+
"this",
|
| 1610 |
+
"that",
|
| 1611 |
+
"what",
|
| 1612 |
+
"where",
|
| 1613 |
+
"when",
|
| 1614 |
+
"which",
|
| 1615 |
+
"return",
|
| 1616 |
+
"output",
|
| 1617 |
+
"format",
|
| 1618 |
+
"data",
|
| 1619 |
+
"list",
|
| 1620 |
+
"site",
|
| 1621 |
+
"website",
|
| 1622 |
+
"page",
|
| 1623 |
+
"entries",
|
| 1624 |
+
"results",
|
| 1625 |
+
"items",
|
| 1626 |
+
"records",
|
| 1627 |
+
"details",
|
| 1628 |
+
"about",
|
| 1629 |
+
"across",
|
| 1630 |
+
"into",
|
| 1631 |
+
"only",
|
| 1632 |
+
"please",
|
| 1633 |
+
"the",
|
| 1634 |
+
"and",
|
| 1635 |
+
}
|
| 1636 |
+
keywords: list[str] = []
|
| 1637 |
+
for token in tokens:
|
| 1638 |
+
if token in stop_words:
|
| 1639 |
+
continue
|
| 1640 |
+
if token not in keywords:
|
| 1641 |
+
keywords.append(token)
|
| 1642 |
+
if len(keywords) >= max_keywords:
|
| 1643 |
+
break
|
| 1644 |
+
return keywords
|
| 1645 |
+
|
| 1646 |
+
|
| 1647 |
+
def _rows_have_signal(rows: list[dict[str, Any]]) -> bool:
|
| 1648 |
+
"""Return True when extracted rows contain at least one non-empty value."""
|
| 1649 |
+
|
| 1650 |
+
for row in rows:
|
| 1651 |
+
if not isinstance(row, dict):
|
| 1652 |
+
continue
|
| 1653 |
+
for value in row.values():
|
| 1654 |
+
if value is None:
|
| 1655 |
+
continue
|
| 1656 |
+
if isinstance(value, str):
|
| 1657 |
+
if value.strip():
|
| 1658 |
+
return True
|
| 1659 |
+
elif value:
|
| 1660 |
+
return True
|
| 1661 |
+
return False
|
| 1662 |
+
|
| 1663 |
+
|
| 1664 |
+
def _rows_relevance_score(rows: list[dict[str, Any]], instructions: str | None) -> float:
    """Score row relevance against instruction keywords (0-1).

    Each dict row is scored by the fraction of instruction keywords found in
    its joined lowercase values; the final score averages the top three row
    scores. Returns 1.0 when no keywords can be derived, 0.0 for no usable
    rows.
    """

    if not rows:
        return 0.0
    keywords = _instruction_keywords(instructions, max_keywords=8)
    if not keywords:
        # Nothing to match against - treat the rows as fully relevant.
        return 1.0

    per_row_scores: list[float] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        cells = [
            str(cell).lower()
            for cell in row.values()
            if cell is not None and str(cell).strip()
        ]
        if not cells:
            continue
        haystack = " ".join(cells)
        matched = sum(1 for keyword in keywords if keyword in haystack)
        per_row_scores.append(matched / len(keywords))

    if not per_row_scores:
        return 0.0

    # Average the best (up to) three rows so one strong hit is not diluted.
    best = sorted(per_row_scores, reverse=True)[: max(1, min(3, len(per_row_scores)))]
    return sum(best) / len(best)
def _fallback_extraction_code(output_instructions: str | None, instructions: str | None = None) -> str:
|
| 1696 |
"""Build deterministic extraction code when live LLM code generation is unavailable."""
|
| 1697 |
|
| 1698 |
columns = _requested_columns_from_output_instructions(output_instructions) or [
|
|
|
|
| 1700 |
"url",
|
| 1701 |
"content",
|
| 1702 |
]
|
| 1703 |
+
keywords = _instruction_keywords(instructions, max_keywords=8)
|
| 1704 |
+
category_hint = keywords[0].title() if keywords else ""
|
| 1705 |
columns_literal = repr(columns)
|
| 1706 |
+
keywords_literal = repr(keywords)
|
| 1707 |
+
category_hint_literal = repr(category_hint)
|
| 1708 |
return f"""
|
| 1709 |
columns = {columns_literal}
|
| 1710 |
+
keywords = {keywords_literal}
|
| 1711 |
+
category_hint = {category_hint_literal}
|
| 1712 |
rows = []
|
| 1713 |
+
candidate_rows = []
|
| 1714 |
seen = set()
|
| 1715 |
anchors = soup.select("a[href]")
|
| 1716 |
+
noise_fragments = [
|
| 1717 |
+
"javascript is disabled",
|
| 1718 |
+
"please enable javascript",
|
| 1719 |
+
"skip to main content",
|
| 1720 |
+
"press enter to activate",
|
| 1721 |
+
"toggle navigation",
|
| 1722 |
+
"close menu",
|
| 1723 |
+
"open menu",
|
| 1724 |
+
"cookie settings",
|
| 1725 |
+
]
|
| 1726 |
+
boilerplate_labels = {{
|
| 1727 |
+
"home",
|
| 1728 |
+
"about",
|
| 1729 |
+
"contact",
|
| 1730 |
+
"contact us",
|
| 1731 |
+
"help",
|
| 1732 |
+
"search",
|
| 1733 |
+
"press",
|
| 1734 |
+
"copyright",
|
| 1735 |
+
"creator",
|
| 1736 |
+
"creators",
|
| 1737 |
+
"advertise",
|
| 1738 |
+
"developers",
|
| 1739 |
+
"terms",
|
| 1740 |
+
"privacy",
|
| 1741 |
+
"policy & safety",
|
| 1742 |
+
"how youtube works",
|
| 1743 |
+
"test new features",
|
| 1744 |
+
"nfl sunday ticket",
|
| 1745 |
+
"sign in",
|
| 1746 |
+
"log in",
|
| 1747 |
+
"sign up",
|
| 1748 |
+
"register",
|
| 1749 |
+
"settings",
|
| 1750 |
+
"report history",
|
| 1751 |
+
"send feedback",
|
| 1752 |
+
"learn more",
|
| 1753 |
+
"more info",
|
| 1754 |
+
}}
|
| 1755 |
+
boilerplate_url_tokens = (
|
| 1756 |
+
"privacy",
|
| 1757 |
+
"terms",
|
| 1758 |
+
"cookie",
|
| 1759 |
+
"contact",
|
| 1760 |
+
"advertis",
|
| 1761 |
+
"copyright",
|
| 1762 |
+
"policy",
|
| 1763 |
+
"press",
|
| 1764 |
+
"help",
|
| 1765 |
+
"about/",
|
| 1766 |
+
"/t/",
|
| 1767 |
+
"legal",
|
| 1768 |
+
"support",
|
| 1769 |
+
"feedback",
|
| 1770 |
+
"settings",
|
| 1771 |
+
"account",
|
| 1772 |
+
"login",
|
| 1773 |
+
"signin",
|
| 1774 |
+
"signup",
|
| 1775 |
+
"creators/",
|
| 1776 |
+
"howyoutubeworks",
|
| 1777 |
+
)
|
| 1778 |
+
ranked_intent = bool(re.search(r"\\b(top|trending|popular|latest|today|best)\\b", " ".join(keywords), re.IGNORECASE))
|
| 1779 |
+
|
| 1780 |
+
def _extract_metric(text, patterns):
|
| 1781 |
+
for pattern in patterns:
|
| 1782 |
+
match = re.search(pattern, text, re.IGNORECASE)
|
| 1783 |
+
if match:
|
| 1784 |
+
return match.group(1)
|
| 1785 |
+
return ""
|
| 1786 |
+
|
| 1787 |
+
def _compact(value, limit):
|
| 1788 |
+
return re.sub(r"\\s+", " ", value).strip()[:limit]
|
| 1789 |
+
|
| 1790 |
+
def _metric_numeric(raw):
|
| 1791 |
+
normalized = str(raw or "").strip().lower().replace(",", "")
|
| 1792 |
+
if not normalized:
|
| 1793 |
+
return 0.0
|
| 1794 |
+
multiplier = 1.0
|
| 1795 |
+
if normalized.endswith("k"):
|
| 1796 |
+
multiplier = 1000.0
|
| 1797 |
+
normalized = normalized[:-1]
|
| 1798 |
+
elif normalized.endswith("m"):
|
| 1799 |
+
multiplier = 1000000.0
|
| 1800 |
+
normalized = normalized[:-1]
|
| 1801 |
+
try:
|
| 1802 |
+
return float(normalized) * multiplier
|
| 1803 |
+
except ValueError:
|
| 1804 |
+
return 0.0
|
| 1805 |
|
| 1806 |
for anchor in anchors:
|
| 1807 |
href = (anchor.get("href") or "").strip()
|
| 1808 |
text = anchor.get_text(" ", strip=True)
|
| 1809 |
if not href and not text:
|
| 1810 |
continue
|
| 1811 |
+
if href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
|
| 1812 |
+
continue
|
| 1813 |
+
full_href = urljoin(url, href)
|
| 1814 |
+
if not full_href.startswith("http"):
|
| 1815 |
+
continue
|
| 1816 |
+
if full_href.count("/") <= 2:
|
| 1817 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1818 |
|
| 1819 |
+
parsed_href = urlparse(full_href)
|
| 1820 |
+
path_parts = [part for part in parsed_href.path.split("/") if part]
|
| 1821 |
+
slug_value = path_parts[-1].replace("-", " ").replace("_", " ").strip() if path_parts else ""
|
| 1822 |
container = anchor.find_parent(["article", "tr", "li", "div"])
|
| 1823 |
container_text = container.get_text(" ", strip=True) if container else text
|
| 1824 |
+
stars_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:stars?|star)\\b"])
|
| 1825 |
+
forks_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:forks?|fork)\\b"])
|
| 1826 |
+
views_value = _extract_metric(
|
| 1827 |
+
container_text,
|
| 1828 |
+
[r"([0-9][0-9,\\.kKmM]*)\\s*(?:views?|viewers?|watching|plays?)\\b"],
|
| 1829 |
+
)
|
| 1830 |
+
likes_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:likes?|thumbs\\s*up)\\b"])
|
| 1831 |
+
comments_value = _extract_metric(container_text, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:comments?|replies)\\b"])
|
| 1832 |
+
date_value = _extract_metric(
|
| 1833 |
+
container_text,
|
| 1834 |
+
[
|
| 1835 |
+
r"\\b(today|yesterday|\\d+\\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\\s+ago)\\b",
|
| 1836 |
+
r"\\b(\\d{{4}}[-/]\\d{{1,2}}[-/]\\d{{1,2}})\\b",
|
| 1837 |
+
r"\\b(\\d{{1,2}}\\s+[A-Za-z]{{3,9}}\\s+\\d{{4}})\\b",
|
| 1838 |
+
],
|
| 1839 |
+
)
|
| 1840 |
+
category_from_url = ""
|
| 1841 |
+
if len(path_parts) >= 3 and path_parts[0].lower() in {"category", "tags", "topic", "topics", "genres", "genre"}:
|
| 1842 |
+
category_from_url = path_parts[1].replace("-", " ").replace("_", " ").strip().title()
|
| 1843 |
+
|
| 1844 |
+
label = (text or container_text).strip()
|
| 1845 |
+
if not label:
|
| 1846 |
+
continue
|
| 1847 |
+
lowered_label = label.lower()
|
| 1848 |
+
lowered_href = full_href.lower()
|
| 1849 |
+
if any(fragment in lowered_label for fragment in noise_fragments):
|
| 1850 |
+
continue
|
| 1851 |
+
if lowered_label in boilerplate_labels:
|
| 1852 |
+
continue
|
| 1853 |
+
if any(token in lowered_href for token in boilerplate_url_tokens):
|
| 1854 |
+
continue
|
| 1855 |
+
if len(label) > 180 or len(label.split()) > 22:
|
| 1856 |
+
continue
|
| 1857 |
+
if label.lower() in {{
|
| 1858 |
+
"main page", "home", "about", "contact", "help", "search", "read", "talk",
|
| 1859 |
+
"view source", "view history", "contents", "current events", "special pages",
|
| 1860 |
+
}}:
|
| 1861 |
+
continue
|
| 1862 |
+
|
| 1863 |
+
score_text = " ".join([label, container_text, full_href]).lower()
|
| 1864 |
+
keyword_score = sum(1 for keyword in keywords if keyword in score_text)
|
| 1865 |
+
has_engagement_metric = any([views_value, likes_value, comments_value, date_value])
|
| 1866 |
+
if keywords and keyword_score == 0 and not has_engagement_metric:
|
| 1867 |
+
continue
|
| 1868 |
+
content_text = (container_text or label).strip()
|
| 1869 |
+
lowered_content_text = content_text.lower()
|
| 1870 |
+
if (
|
| 1871 |
+
len(content_text) > 220
|
| 1872 |
+
or " menu " in lowered_content_text
|
| 1873 |
+
or "dropdown" in lowered_content_text
|
| 1874 |
+
or "press enter to" in lowered_content_text
|
| 1875 |
+
):
|
| 1876 |
+
content_text = label
|
| 1877 |
|
| 1878 |
row = {{}}
|
| 1879 |
for column in columns:
|
| 1880 |
lower = column.lower()
|
| 1881 |
if lower in {{"url", "link", "href"}}:
|
| 1882 |
row[column] = full_href
|
| 1883 |
+
elif lower in {{"title", "name", "text"}}:
|
| 1884 |
+
row[column] = _compact(label, 160)
|
| 1885 |
+
elif lower in {{"content", "summary", "description"}}:
|
| 1886 |
+
row[column] = _compact(content_text, 320)
|
| 1887 |
+
elif lower in {{"streamer", "channel", "creator", "username", "user", "owner"}}:
|
| 1888 |
+
row[column] = _compact(slug_value or label, 120)
|
| 1889 |
elif lower in {{"repo", "repository", "repo_name"}}:
|
| 1890 |
+
row[column] = path_parts[1] if len(path_parts) >= 2 else _compact(slug_value, 120)
|
| 1891 |
elif lower in {{"stars", "star", "star_count"}}:
|
| 1892 |
+
row[column] = stars_value
|
| 1893 |
elif lower in {{"forks", "fork", "fork_count"}}:
|
| 1894 |
+
row[column] = forks_value
|
| 1895 |
+
elif lower in {{"views", "view_count", "viewers", "viewer_count", "watchers", "watching"}}:
|
| 1896 |
+
row[column] = views_value
|
| 1897 |
+
elif lower in {{"likes", "like_count"}}:
|
| 1898 |
+
row[column] = likes_value
|
| 1899 |
+
elif lower in {{"comments", "comment_count"}}:
|
| 1900 |
+
row[column] = comments_value
|
| 1901 |
+
elif lower in {{"date", "date_uploaded", "date_uplaoded", "published", "uploaded", "upload_date"}}:
|
| 1902 |
+
row[column] = date_value
|
| 1903 |
+
elif lower in {{"category", "game", "topic"}}:
|
| 1904 |
+
row[column] = category_from_url or category_hint
|
| 1905 |
else:
|
| 1906 |
row[column] = ""
|
| 1907 |
|
|
|
|
| 1911 |
seen.add(row_key)
|
| 1912 |
|
| 1913 |
if any(value for value in row.values()):
|
| 1914 |
+
quality_score = keyword_score
|
| 1915 |
+
if views_value:
|
| 1916 |
+
quality_score += 2
|
| 1917 |
+
if likes_value or comments_value:
|
| 1918 |
+
quality_score += 1
|
| 1919 |
+
candidate_rows.append((quality_score, row))
|
| 1920 |
+
|
| 1921 |
+
if not candidate_rows:
|
| 1922 |
+
raw_lines = [line.strip() for line in soup.get_text("\\n").splitlines() if line and line.strip()]
|
| 1923 |
+
for line in raw_lines:
|
| 1924 |
+
if len(line) < 15:
|
| 1925 |
+
continue
|
| 1926 |
+
lowered_line = line.lower()
|
| 1927 |
+
if any(fragment in lowered_line for fragment in noise_fragments):
|
| 1928 |
+
continue
|
| 1929 |
+
if len(line) > 260:
|
| 1930 |
+
continue
|
| 1931 |
+
if lowered_line.startswith(("title:", "url source:", "markdown content:")):
|
| 1932 |
+
continue
|
| 1933 |
+
if re.match(r"^\\*\\s+\\[(all|images|videos|news|maps|shopping)\\]", lowered_line):
|
| 1934 |
+
continue
|
| 1935 |
+
if re.match(r"^\\[[^\\]]+\\]\\(https?://duckduckgo\\.com/", lowered_line):
|
| 1936 |
+
continue
|
| 1937 |
+
if lowered_line in {"privacy", "terms", "advertising", "about duckduckgo"}:
|
| 1938 |
+
continue
|
| 1939 |
+
if lowered_line.startswith("![image"):
|
| 1940 |
+
continue
|
| 1941 |
+
if lowered_line in boilerplate_labels:
|
| 1942 |
+
continue
|
| 1943 |
+
keyword_score = sum(1 for keyword in keywords if keyword in lowered_line)
|
| 1944 |
+
views_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:views?|viewers?|watching|plays?)\\b"])
|
| 1945 |
+
likes_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:likes?|thumbs\\s*up)\\b"])
|
| 1946 |
+
comments_value = _extract_metric(line, [r"([0-9][0-9,\\.kKmM]*)\\s*(?:comments?|replies)\\b"])
|
| 1947 |
+
date_value = _extract_metric(
|
| 1948 |
+
line,
|
| 1949 |
+
[
|
| 1950 |
+
r"\\b(today|yesterday|\\d+\\s+(?:minutes?|hours?|days?|weeks?|months?|years?)\\s+ago)\\b",
|
| 1951 |
+
r"\\b(\\d{{4}}[-/]\\d{{1,2}}[-/]\\d{{1,2}})\\b",
|
| 1952 |
+
r"\\b(\\d{{1,2}}\\s+[A-Za-z]{{3,9}}\\s+\\d{{4}})\\b",
|
| 1953 |
+
],
|
| 1954 |
+
)
|
| 1955 |
+
markdown_link_match = re.search(r"\\[([^\\]]+)\\]\\((https?://[^\\)]+)\\)", line)
|
| 1956 |
+
plain_link_match = re.search(r"https?://[^\\s\\)]+", line)
|
| 1957 |
+
if markdown_link_match:
|
| 1958 |
+
line_title = markdown_link_match.group(1).strip()
|
| 1959 |
+
line_link = markdown_link_match.group(2).strip()
|
| 1960 |
+
else:
|
| 1961 |
+
line_title = line.strip()
|
| 1962 |
+
line_link = plain_link_match.group(0).strip() if plain_link_match else url
|
| 1963 |
+
|
| 1964 |
+
if ranked_intent and keywords and keyword_score == 0 and not any([views_value, likes_value, comments_value]):
|
| 1965 |
+
continue
|
| 1966 |
+
|
| 1967 |
+
row = {{}}
|
| 1968 |
+
for column in columns:
|
| 1969 |
+
lower = column.lower()
|
| 1970 |
+
if lower in {{"url", "link", "href"}}:
|
| 1971 |
+
row[column] = line_link
|
| 1972 |
+
elif lower in {{"title", "name", "text"}}:
|
| 1973 |
+
row[column] = _compact(line_title, 160)
|
| 1974 |
+
elif lower in {{"content", "summary", "description"}}:
|
| 1975 |
+
row[column] = _compact(line, 320)
|
| 1976 |
+
elif lower in {{"streamer", "channel", "creator", "username", "user", "owner"}}:
|
| 1977 |
+
row[column] = _compact(line_title, 120)
|
| 1978 |
+
elif lower in {{"views", "view_count", "viewers", "viewer_count", "watchers", "watching"}}:
|
| 1979 |
+
row[column] = views_value
|
| 1980 |
+
elif lower in {{"likes", "like_count"}}:
|
| 1981 |
+
row[column] = likes_value
|
| 1982 |
+
elif lower in {{"comments", "comment_count"}}:
|
| 1983 |
+
row[column] = comments_value
|
| 1984 |
+
elif lower in {{"date", "date_uploaded", "date_uplaoded", "published", "uploaded", "upload_date"}}:
|
| 1985 |
+
row[column] = date_value
|
| 1986 |
+
elif lower in {{"category", "game", "topic"}}:
|
| 1987 |
+
row[column] = category_hint
|
| 1988 |
+
else:
|
| 1989 |
+
row[column] = ""
|
| 1990 |
+
row_key = tuple(row.get(column, "") for column in columns)
|
| 1991 |
+
if row_key in seen:
|
| 1992 |
+
continue
|
| 1993 |
+
seen.add(row_key)
|
| 1994 |
+
quality_score = max(keyword_score, 1)
|
| 1995 |
+
if views_value:
|
| 1996 |
+
quality_score += 2
|
| 1997 |
+
if likes_value or comments_value:
|
| 1998 |
+
quality_score += 1
|
| 1999 |
+
candidate_rows.append((quality_score, row))
|
| 2000 |
+
if len(candidate_rows) >= 40:
|
| 2001 |
+
break
|
| 2002 |
+
|
| 2003 |
+
ranking_column = next(
|
| 2004 |
+
(
|
| 2005 |
+
column
|
| 2006 |
+
for column in columns
|
| 2007 |
+
if column.lower() in {{
|
| 2008 |
+
"views",
|
| 2009 |
+
"view_count",
|
| 2010 |
+
"viewers",
|
| 2011 |
+
"viewer_count",
|
| 2012 |
+
"watchers",
|
| 2013 |
+
"watching",
|
| 2014 |
+
"likes",
|
| 2015 |
+
"like_count",
|
| 2016 |
+
"comments",
|
| 2017 |
+
"comment_count",
|
| 2018 |
+
"stars",
|
| 2019 |
+
"star_count",
|
| 2020 |
+
"forks",
|
| 2021 |
+
"fork_count",
|
| 2022 |
+
}}
|
| 2023 |
+
),
|
| 2024 |
+
None,
|
| 2025 |
+
)
|
| 2026 |
+
|
| 2027 |
+
if ranking_column:
|
| 2028 |
+
candidate_rows.sort(key=lambda pair: (_metric_numeric(pair[1].get(ranking_column, "")), pair[0]), reverse=True)
|
| 2029 |
+
elif keywords:
|
| 2030 |
+
candidate_rows.sort(key=lambda pair: pair[0], reverse=True)
|
| 2031 |
+
|
| 2032 |
+
for _, row in candidate_rows:
|
| 2033 |
+
rows.append(row)
|
| 2034 |
if len(rows) >= 25:
|
| 2035 |
break
|
| 2036 |
|
|
|
|
| 2121 |
navigation_mode = "llm"
|
| 2122 |
except Exception as e:
|
| 2123 |
logger.warning("LLM navigation decision failed, using heuristic fallback: %s", e)
|
| 2124 |
+
target_url = _apply_text_render_proxy(target_url)
|
| 2125 |
|
| 2126 |
# Tool call: LLM navigation planning
|
| 2127 |
yield _record_step(
|
|
|
|
| 2433 |
|
| 2434 |
Return ONLY executable Python code, no explanations or markdown:"""
|
| 2435 |
|
| 2436 |
+
extraction_code = _fallback_extraction_code(
|
| 2437 |
+
request.output_instructions,
|
| 2438 |
+
request.instructions,
|
| 2439 |
+
)
|
| 2440 |
codegen_mode = "heuristic"
|
| 2441 |
if live_llm_enabled:
|
| 2442 |
try:
|
|
|
|
| 2509 |
"html": nav_obs.page_html,
|
| 2510 |
"url": target_url,
|
| 2511 |
"re": re,
|
| 2512 |
+
"urljoin": urljoin,
|
| 2513 |
"urlparse": urlparse,
|
| 2514 |
"BeautifulSoup": BeautifulSoup,
|
| 2515 |
"extracted_data": [], # LLM code should populate this
|
| 2516 |
}
|
| 2517 |
output_columns: list[str] = []
|
| 2518 |
+
execution_mode = codegen_mode
|
| 2519 |
|
| 2520 |
try:
|
| 2521 |
# Execute the LLM-generated code
|
|
|
|
| 2528 |
extracted_data,
|
| 2529 |
request.output_instructions,
|
| 2530 |
)
|
| 2531 |
+
requested_limit = _requested_row_limit(request.instructions, default_limit=25)
|
| 2532 |
+
extracted_data = extracted_data[:requested_limit]
|
| 2533 |
+
relevance_score = _rows_relevance_score(extracted_data, request.instructions)
|
| 2534 |
+
|
| 2535 |
+
if not _rows_have_signal(extracted_data):
|
| 2536 |
+
if codegen_mode == "llm":
|
| 2537 |
+
try:
|
| 2538 |
+
heuristic_code = _fallback_extraction_code(
|
| 2539 |
+
request.output_instructions,
|
| 2540 |
+
request.instructions,
|
| 2541 |
+
)
|
| 2542 |
+
heuristic_globals = {
|
| 2543 |
+
**sandbox_globals,
|
| 2544 |
+
"extracted_data": [],
|
| 2545 |
+
}
|
| 2546 |
+
exec(heuristic_code, heuristic_globals)
|
| 2547 |
+
heuristic_data = heuristic_globals.get("extracted_data", [])
|
| 2548 |
+
if not isinstance(heuristic_data, list):
|
| 2549 |
+
heuristic_data = [heuristic_data] if heuristic_data else []
|
| 2550 |
+
heuristic_data, heuristic_columns = _enforce_requested_schema(
|
| 2551 |
+
heuristic_data,
|
| 2552 |
+
request.output_instructions,
|
| 2553 |
+
)
|
| 2554 |
+
heuristic_data = heuristic_data[:requested_limit]
|
| 2555 |
+
if _rows_have_signal(heuristic_data):
|
| 2556 |
+
extracted_data = heuristic_data
|
| 2557 |
+
output_columns = heuristic_columns or output_columns
|
| 2558 |
+
execution_mode = "llm_with_heuristic_recovery"
|
| 2559 |
+
except Exception as recovery_error:
|
| 2560 |
+
logger.warning("Heuristic recovery after empty LLM extraction failed: %s", recovery_error)
|
| 2561 |
+
|
| 2562 |
+
if not _rows_have_signal(extracted_data):
|
| 2563 |
+
text_render_payload = _fetch_text_render_markdown(target_url, timeout_seconds=12)
|
| 2564 |
+
if text_render_payload:
|
| 2565 |
+
text_markdown, text_render_url = text_render_payload
|
| 2566 |
+
try:
|
| 2567 |
+
text_data, text_columns = _extract_rows_from_text_render(
|
| 2568 |
+
markdown=text_markdown,
|
| 2569 |
+
source_url=text_render_url,
|
| 2570 |
+
output_instructions=request.output_instructions,
|
| 2571 |
+
instructions=request.instructions,
|
| 2572 |
+
row_limit=requested_limit,
|
| 2573 |
+
)
|
| 2574 |
+
if _rows_have_signal(text_data):
|
| 2575 |
+
extracted_data = text_data
|
| 2576 |
+
output_columns = text_columns or output_columns
|
| 2577 |
+
execution_mode = "text_render_recovery"
|
| 2578 |
+
target_url = text_render_url
|
| 2579 |
+
except Exception as text_recovery_error:
|
| 2580 |
+
logger.warning("Text-render recovery after empty extraction failed: %s", text_recovery_error)
|
| 2581 |
+
|
| 2582 |
+
relevance_score = _rows_relevance_score(extracted_data, request.instructions)
|
| 2583 |
+
recovery_keywords = _instruction_keywords(request.instructions, max_keywords=8)
|
| 2584 |
+
if _rows_have_signal(extracted_data) and recovery_keywords and relevance_score < 0.22:
|
| 2585 |
+
step_num += 1
|
| 2586 |
+
yield _record_step(
|
| 2587 |
+
session,
|
| 2588 |
+
ScrapeStep(
|
| 2589 |
+
step_number=step_num,
|
| 2590 |
+
action="tool_call",
|
| 2591 |
+
url=target_url,
|
| 2592 |
+
status="running",
|
| 2593 |
+
message="agent.recover_relevance(query)",
|
| 2594 |
+
extracted_data={
|
| 2595 |
+
"tool_name": "agent.recover_relevance",
|
| 2596 |
+
"tool_description": "Search-guided relevance recovery for low-signal extraction output",
|
| 2597 |
+
"parameters": {
|
| 2598 |
+
"keywords": recovery_keywords,
|
| 2599 |
+
"baseline_relevance": round(relevance_score, 3),
|
| 2600 |
+
},
|
| 2601 |
+
},
|
| 2602 |
+
timestamp=_now_iso(),
|
| 2603 |
+
),
|
| 2604 |
+
)
|
| 2605 |
+
|
| 2606 |
+
recovered_rows, recovered_columns, recovered_source, recovered_score = await _search_recovery_rows(
|
| 2607 |
+
base_url=url,
|
| 2608 |
+
instructions=request.instructions,
|
| 2609 |
+
output_instructions=request.output_instructions,
|
| 2610 |
+
row_limit=requested_limit,
|
| 2611 |
+
)
|
| 2612 |
+
improved = _rows_have_signal(recovered_rows) and recovered_score > (relevance_score + 0.05)
|
| 2613 |
+
if improved:
|
| 2614 |
+
extracted_data = recovered_rows
|
| 2615 |
+
output_columns = recovered_columns or output_columns
|
| 2616 |
+
target_url = recovered_source or target_url
|
| 2617 |
+
execution_mode = "search_recovery"
|
| 2618 |
+
relevance_score = recovered_score
|
| 2619 |
+
|
| 2620 |
+
yield _record_step(
|
| 2621 |
+
session,
|
| 2622 |
+
ScrapeStep(
|
| 2623 |
+
step_number=step_num,
|
| 2624 |
+
action="tool_call",
|
| 2625 |
+
url=target_url,
|
| 2626 |
+
status="complete",
|
| 2627 |
+
message=(
|
| 2628 |
+
f"agent.recover_relevance() → {'improved' if improved else 'no_change'} "
|
| 2629 |
+
f"({relevance_score:.2f})"
|
| 2630 |
+
),
|
| 2631 |
+
extracted_data={
|
| 2632 |
+
"tool_name": "agent.recover_relevance",
|
| 2633 |
+
"result": {
|
| 2634 |
+
"improved": improved,
|
| 2635 |
+
"relevance": round(relevance_score, 3),
|
| 2636 |
+
"recovered_rows": len(recovered_rows),
|
| 2637 |
+
"source": recovered_source,
|
| 2638 |
+
},
|
| 2639 |
+
},
|
| 2640 |
+
reward=0.1 if improved else 0.0,
|
| 2641 |
+
timestamp=_now_iso(),
|
| 2642 |
+
),
|
| 2643 |
+
)
|
| 2644 |
+
if improved:
|
| 2645 |
+
total_reward += 0.1
|
| 2646 |
|
| 2647 |
+
has_signal = _rows_have_signal(extracted_data)
|
| 2648 |
+
exec_reward = 0.5 if has_signal else 0.1
|
| 2649 |
total_reward += exec_reward
|
| 2650 |
|
| 2651 |
yield _record_step(
|
|
|
|
| 2661 |
"tool_description": "Execute extraction code in sandbox",
|
| 2662 |
"result": {
|
| 2663 |
"items_extracted": len(extracted_data),
|
| 2664 |
+
"has_signal": has_signal,
|
| 2665 |
+
"relevance_score": round(relevance_score, 3),
|
| 2666 |
+
"mode": execution_mode,
|
| 2667 |
"columns": output_columns,
|
| 2668 |
"sample": extracted_data[:2] if extracted_data else [],
|
| 2669 |
},
|
|
|
|
| 2686 |
extracted_data,
|
| 2687 |
request.output_instructions,
|
| 2688 |
)
|
| 2689 |
+
requested_limit = _requested_row_limit(request.instructions, default_limit=25)
|
| 2690 |
+
extracted_data = extracted_data[:requested_limit]
|
| 2691 |
total_reward += 0.05
|
| 2692 |
|
| 2693 |
yield _record_step(
|
|
|
|
| 2740 |
|
| 2741 |
# Store extracted data in session
|
| 2742 |
if request.output_format == OutputFormat.CSV and extracted_data:
|
| 2743 |
+
existing_rows: list[dict[str, Any]] = []
|
| 2744 |
+
existing_sources: list[str] = []
|
| 2745 |
+
existing_payload = session.get("extracted_data")
|
| 2746 |
+
if isinstance(existing_payload, dict):
|
| 2747 |
+
if isinstance(existing_payload.get("rows"), list):
|
| 2748 |
+
existing_rows = [row for row in existing_payload["rows"] if isinstance(row, dict)]
|
| 2749 |
+
if isinstance(existing_payload.get("sources"), list):
|
| 2750 |
+
existing_sources = [str(value) for value in existing_payload["sources"]]
|
| 2751 |
+
|
| 2752 |
+
merged_rows = [*existing_rows, *extracted_data]
|
| 2753 |
+
fieldnames = output_columns or list(extracted_data[0].keys())
|
| 2754 |
+
|
| 2755 |
+
deduped_rows: list[dict[str, Any]] = []
|
| 2756 |
+
seen_keys: set[tuple[str, ...]] = set()
|
| 2757 |
+
for row in merged_rows:
|
| 2758 |
+
normalized_row = {field: str(row.get(field, "")) for field in fieldnames}
|
| 2759 |
+
row_key = tuple(normalized_row[field] for field in fieldnames)
|
| 2760 |
+
if row_key in seen_keys:
|
| 2761 |
+
continue
|
| 2762 |
+
seen_keys.add(row_key)
|
| 2763 |
+
deduped_rows.append(normalized_row)
|
| 2764 |
+
|
| 2765 |
+
requested_limit = _requested_row_limit(request.instructions, default_limit=25)
|
| 2766 |
+
deduped_rows = deduped_rows[:requested_limit]
|
| 2767 |
+
|
| 2768 |
output_buffer = io.StringIO()
|
| 2769 |
+
writer = csv.DictWriter(output_buffer, fieldnames=fieldnames)
|
| 2770 |
+
writer.writeheader()
|
| 2771 |
+
writer.writerows(deduped_rows)
|
| 2772 |
+
|
| 2773 |
+
merged_sources = [*existing_sources]
|
| 2774 |
+
if target_url not in merged_sources:
|
| 2775 |
+
merged_sources.append(target_url)
|
| 2776 |
|
| 2777 |
session["extracted_data"] = {
|
| 2778 |
"csv_output": output_buffer.getvalue(),
|
| 2779 |
+
"rows": deduped_rows,
|
| 2780 |
+
"columns": fieldnames,
|
| 2781 |
+
"row_count": len(deduped_rows),
|
| 2782 |
+
"sources": merged_sources,
|
| 2783 |
}
|
| 2784 |
else:
|
| 2785 |
+
current_payload = session.get("extracted_data")
|
| 2786 |
+
merged_payload: dict[str, Any] = {}
|
| 2787 |
+
if isinstance(current_payload, dict) and "csv_output" not in current_payload:
|
| 2788 |
+
merged_payload.update(current_payload)
|
| 2789 |
+
merged_payload[target_url] = extracted_data
|
| 2790 |
+
session["extracted_data"] = merged_payload
|
| 2791 |
|
| 2792 |
total_reward += 0.1
|
| 2793 |
|
|
@@ -90,6 +90,21 @@ def _extract_domains_from_assets(assets: list[str]) -> list[str]:
|
|
| 90 |
return domains
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def match_site_template(instructions: str, assets: list[str]) -> SiteTemplate | None:
|
| 94 |
"""Match site template by URL domain first, then instruction aliases."""
|
| 95 |
|
|
@@ -106,7 +121,7 @@ def match_site_template(instructions: str, assets: list[str]) -> SiteTemplate |
|
|
| 106 |
# Alias fallback
|
| 107 |
for template in SITE_TEMPLATES:
|
| 108 |
alias_tokens = [template.name.lower(), template.site_id.lower(), *[alias.lower() for alias in template.aliases]]
|
| 109 |
-
if any(
|
| 110 |
return template
|
| 111 |
|
| 112 |
return None
|
|
|
|
| 90 |
return domains
|
| 91 |
|
| 92 |
|
| 93 |
+
def _instruction_contains_alias(instructions_lower: str, token: str) -> bool:
|
| 94 |
+
"""Return True when an alias token is present as a semantic token, not a substring artifact."""
|
| 95 |
+
|
| 96 |
+
alias = token.strip().lower()
|
| 97 |
+
if not alias:
|
| 98 |
+
return False
|
| 99 |
+
|
| 100 |
+
if " " in alias:
|
| 101 |
+
return alias in instructions_lower
|
| 102 |
+
|
| 103 |
+
# Avoid one-letter alias false positives (e.g. "x" in "extract").
|
| 104 |
+
pattern = rf"(^|[^a-z0-9]){re.escape(alias)}([^a-z0-9]|$)"
|
| 105 |
+
return re.search(pattern, instructions_lower) is not None
|
| 106 |
+
|
| 107 |
+
|
| 108 |
def match_site_template(instructions: str, assets: list[str]) -> SiteTemplate | None:
|
| 109 |
"""Match site template by URL domain first, then instruction aliases."""
|
| 110 |
|
|
|
|
| 121 |
# Alias fallback
|
| 122 |
for template in SITE_TEMPLATES:
|
| 123 |
alias_tokens = [template.name.lower(), template.site_id.lower(), *[alias.lower() for alias in template.aliases]]
|
| 124 |
+
if any(_instruction_contains_alias(instructions_lower, token) for token in alias_tokens):
|
| 125 |
return template
|
| 126 |
|
| 127 |
return None
|
|
@@ -54,3 +54,24 @@ def test_registry_serialization_covers_all_templates() -> None:
|
|
| 54 |
serialized_ids = {item["site_id"] for item in serialized}
|
| 55 |
template_ids = {template.site_id for template in SITE_TEMPLATES}
|
| 56 |
assert serialized_ids == template_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
serialized_ids = {item["site_id"] for item in serialized}
|
| 55 |
template_ids = {template.site_id for template in SITE_TEMPLATES}
|
| 56 |
assert serialized_ids == template_ids
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_alias_matching_avoids_single_char_false_positive() -> None:
|
| 60 |
+
"""Single-character aliases should not match inside larger words (e.g. 'x' in 'extract')."""
|
| 61 |
+
|
| 62 |
+
matched = match_site_template(
|
| 63 |
+
"Find and extract top scraping frameworks with details",
|
| 64 |
+
["open source scraping frameworks comparison"],
|
| 65 |
+
)
|
| 66 |
+
assert matched is None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_alias_matching_still_supports_explicit_x_reference() -> None:
|
| 70 |
+
"""Explicit references to X should still match the X template."""
|
| 71 |
+
|
| 72 |
+
matched = match_site_template(
|
| 73 |
+
"Get top posts from x today",
|
| 74 |
+
["social trend query"],
|
| 75 |
+
)
|
| 76 |
+
assert matched is not None
|
| 77 |
+
assert matched.site_id == "x"
|
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"sites": 10,
|
| 3 |
+
"csv_pass": 10,
|
| 4 |
+
"json_pass": 10,
|
| 5 |
+
"combined_pass": 10,
|
| 6 |
+
"results": [
|
| 7 |
+
{
|
| 8 |
+
"site": "https://python.org/",
|
| 9 |
+
"csv_header": [
|
| 10 |
+
"title",
|
| 11 |
+
"url",
|
| 12 |
+
"content"
|
| 13 |
+
],
|
| 14 |
+
"csv_rows": 5,
|
| 15 |
+
"json_keys": [
|
| 16 |
+
"title",
|
| 17 |
+
"url",
|
| 18 |
+
"content"
|
| 19 |
+
],
|
| 20 |
+
"json_rows": 5,
|
| 21 |
+
"csv_ok": true,
|
| 22 |
+
"json_ok": true,
|
| 23 |
+
"sample_csv": "title,url,content\nPython,https://python.org/,Python\nPSF,https://www.python.org/psf/,PSF",
|
| 24 |
+
"sample_json": "[{\"title\": \"Python\", \"url\": \"https://python.org/\", \"content\": \"Python\"}, {\"title\": \"PSF\", \"url\": \"https://www.python.org/psf/\", \"content\": \"PSF\"}]"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"site": "https://pypi.org/",
|
| 28 |
+
"csv_header": [
|
| 29 |
+
"title",
|
| 30 |
+
"url",
|
| 31 |
+
"content"
|
| 32 |
+
],
|
| 33 |
+
"csv_rows": 5,
|
| 34 |
+
"json_keys": [
|
| 35 |
+
"title",
|
| 36 |
+
"url",
|
| 37 |
+
"content"
|
| 38 |
+
],
|
| 39 |
+
"json_rows": 5,
|
| 40 |
+
"csv_ok": true,
|
| 41 |
+
"json_ok": true,
|
| 42 |
+
"sample_csv": "title,url,content\nDocs,https://docs.pypi.org/,Docs\nSponsors,https://pypi.org/sponsors/,Sponsors",
|
| 43 |
+
"sample_json": "[{\"title\": \"Docs\", \"url\": \"https://docs.pypi.org/\", \"content\": \"Docs\"}, {\"title\": \"Sponsors\", \"url\": \"https://pypi.org/sponsors/\", \"content\": \"Sponsors\"}]"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"site": "https://kubernetes.io/",
|
| 47 |
+
"csv_header": [
|
| 48 |
+
"title",
|
| 49 |
+
"url",
|
| 50 |
+
"content"
|
| 51 |
+
],
|
| 52 |
+
"csv_rows": 5,
|
| 53 |
+
"json_keys": [
|
| 54 |
+
"title",
|
| 55 |
+
"url",
|
| 56 |
+
"content"
|
| 57 |
+
],
|
| 58 |
+
"json_rows": 5,
|
| 59 |
+
"csv_ok": true,
|
| 60 |
+
"json_ok": true,
|
| 61 |
+
"sample_csv": "title,url,content\nKubernetes,https://kubernetes.io/,Kubernetes\nDocumentation,https://kubernetes.io/docs/home/,Documentation",
|
| 62 |
+
"sample_json": "[{\"title\": \"Kubernetes\", \"url\": \"https://kubernetes.io/\", \"content\": \"Kubernetes\"}, {\"title\": \"Documentation\", \"url\": \"https://kubernetes.io/docs/home/\", \"content\": \"Documentation\"}]"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"site": "https://huggingface.co/",
|
| 66 |
+
"csv_header": [
|
| 67 |
+
"title",
|
| 68 |
+
"url",
|
| 69 |
+
"content"
|
| 70 |
+
],
|
| 71 |
+
"csv_rows": 5,
|
| 72 |
+
"json_keys": [
|
| 73 |
+
"title",
|
| 74 |
+
"url",
|
| 75 |
+
"content"
|
| 76 |
+
],
|
| 77 |
+
"json_rows": 5,
|
| 78 |
+
"csv_ok": true,
|
| 79 |
+
"json_ok": true,
|
| 80 |
+
"sample_csv": "title,url,content\nHugging Face,https://huggingface.co/,Hugging Face\nModels,https://huggingface.co/models,Models",
|
| 81 |
+
"sample_json": "[{\"title\": \"Hugging Face\", \"url\": \"https://huggingface.co/\", \"content\": \"Hugging Face\"}, {\"title\": \"Models\", \"url\": \"https://huggingface.co/models\", \"content\": \"Models\"}]"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"site": "https://news.ycombinator.com/",
|
| 85 |
+
"csv_header": [
|
| 86 |
+
"title",
|
| 87 |
+
"url",
|
| 88 |
+
"content"
|
| 89 |
+
],
|
| 90 |
+
"csv_rows": 5,
|
| 91 |
+
"json_keys": [
|
| 92 |
+
"title",
|
| 93 |
+
"url",
|
| 94 |
+
"content"
|
| 95 |
+
],
|
| 96 |
+
"json_rows": 5,
|
| 97 |
+
"csv_ok": true,
|
| 98 |
+
"json_ok": true,
|
| 99 |
+
"sample_csv": "title,url,content\nHacker News,https://news.ycombinator.com/news,Hacker News new | past | comments | ask | show | jobs | submit login\nnew,https://news.ycombinator.com/newest,Hacker News new | past | comments | ask | show | jobs | submit login",
|
| 100 |
+
"sample_json": "[{\"title\": \"Hacker News\", \"url\": \"https://news.ycombinator.com/news\", \"content\": \"Hacker News new | past | comments | ask | show | jobs | submit login\"}, {\"title\": \"new\", \"url\": \"https://news.ycombinator.com/newest\", \"content\": \"Hacker News new | past | comments | ask | show | jobs | submit login\"}]"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"site": "https://docs.docker.com/",
|
| 104 |
+
"csv_header": [
|
| 105 |
+
"title",
|
| 106 |
+
"url",
|
| 107 |
+
"content"
|
| 108 |
+
],
|
| 109 |
+
"csv_rows": 6,
|
| 110 |
+
"json_keys": [
|
| 111 |
+
"title",
|
| 112 |
+
"url",
|
| 113 |
+
"content"
|
| 114 |
+
],
|
| 115 |
+
"json_rows": 5,
|
| 116 |
+
"csv_ok": true,
|
| 117 |
+
"json_ok": true,
|
| 118 |
+
"sample_csv": "title,url,content\nGet started,https://docs.docker.com/get-started/,Get started\nGuides,https://docs.docker.com/guides/,Guides",
|
| 119 |
+
"sample_json": "[{\"title\": \"Get started\", \"url\": \"https://docs.docker.com/get-started/\", \"content\": \"Get started\"}, {\"title\": \"Guides\", \"url\": \"https://docs.docker.com/guides/\", \"content\": \"Guides\"}]"
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"site": "https://mozilla.org/",
|
| 123 |
+
"csv_header": [
|
| 124 |
+
"title",
|
| 125 |
+
"url",
|
| 126 |
+
"content"
|
| 127 |
+
],
|
| 128 |
+
"csv_rows": 5,
|
| 129 |
+
"json_keys": [
|
| 130 |
+
"title",
|
| 131 |
+
"url",
|
| 132 |
+
"content"
|
| 133 |
+
],
|
| 134 |
+
"json_rows": 5,
|
| 135 |
+
"csv_ok": true,
|
| 136 |
+
"json_ok": true,
|
| 137 |
+
"sample_csv": "title,url,content\nProducts,https://mozilla.org/en-US/products/,Products\nMozilla VPN,https://mozilla.org/en-US/products/vpn/,Mozilla VPN",
|
| 138 |
+
"sample_json": "[{\"title\": \"Products\", \"url\": \"https://mozilla.org/en-US/products/\", \"content\": \"Products\"}, {\"title\": \"Mozilla VPN\", \"url\": \"https://mozilla.org/en-US/products/vpn/\", \"content\": \"Mozilla VPN\"}]"
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"site": "https://go.dev/",
|
| 142 |
+
"csv_header": [
|
| 143 |
+
"title",
|
| 144 |
+
"url",
|
| 145 |
+
"content"
|
| 146 |
+
],
|
| 147 |
+
"csv_rows": 5,
|
| 148 |
+
"json_keys": [
|
| 149 |
+
"title",
|
| 150 |
+
"url",
|
| 151 |
+
"content"
|
| 152 |
+
],
|
| 153 |
+
"json_rows": 5,
|
| 154 |
+
"csv_ok": true,
|
| 155 |
+
"json_ok": true,
|
| 156 |
+
"sample_csv": "title,url,content\nCase Studies,https://go.dev/solutions/case-studies,Case Studies\nUse Cases,https://go.dev/solutions/use-cases,Use Cases",
|
| 157 |
+
"sample_json": "[{\"title\": \"Case Studies\", \"url\": \"https://go.dev/solutions/case-studies\", \"content\": \"Case Studies\"}, {\"title\": \"Use Cases\", \"url\": \"https://go.dev/solutions/use-cases\", \"content\": \"Use Cases\"}]"
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"site": "https://example.com/",
|
| 161 |
+
"csv_header": [
|
| 162 |
+
"title",
|
| 163 |
+
"url",
|
| 164 |
+
"content"
|
| 165 |
+
],
|
| 166 |
+
"csv_rows": 1,
|
| 167 |
+
"json_keys": [
|
| 168 |
+
"title",
|
| 169 |
+
"url",
|
| 170 |
+
"content"
|
| 171 |
+
],
|
| 172 |
+
"json_rows": 1,
|
| 173 |
+
"csv_ok": true,
|
| 174 |
+
"json_ok": true,
|
| 175 |
+
"sample_csv": "title,url,content\nLearn more,https://iana.org/domains/example,Example Domain This domain is for use in documentation examples without needing permission. Avoid use in operations. Learn more",
|
| 176 |
+
"sample_json": "[{\"title\": \"Learn more\", \"url\": \"https://iana.org/domains/example\", \"content\": \"Example Domain This domain is for use in documentation examples without needing permission. Avoid use in operations. Learn more\"}]"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"site": "https://www.w3.org/",
|
| 180 |
+
"csv_header": [
|
| 181 |
+
"title",
|
| 182 |
+
"url",
|
| 183 |
+
"content"
|
| 184 |
+
],
|
| 185 |
+
"csv_rows": 5,
|
| 186 |
+
"json_keys": [
|
| 187 |
+
"title",
|
| 188 |
+
"url",
|
| 189 |
+
"content"
|
| 190 |
+
],
|
| 191 |
+
"json_rows": 5,
|
| 192 |
+
"csv_ok": true,
|
| 193 |
+
"json_ok": true,
|
| 194 |
+
"sample_csv": "title,url,content\nNews,https://www.w3.org/news/,\"Latest news Latest entries from across our News , Press Releases or Blog .\"\nPress Releases,https://www.w3.org/press-releases/,\"Latest news Latest entries from across our News , Press Releases or Blog .\"",
|
| 195 |
+
"sample_json": "[{\"title\": \"News\", \"url\": \"https://www.w3.org/news/\", \"content\": \"Latest news Latest entries from across our News , Press Releases or Blog .\"}, {\"title\": \"Press Releases\", \"url\": \"https://www.w3.org/press-releases/\", \"content\": \"Latest news Latest entries from across our News , Press Releases or Blog .\"}]"
|
| 196 |
+
}
|
| 197 |
+
]
|
| 198 |
+
}
|
|
@@ -1,133 +0,0 @@
|
|
| 1 |
-
# Template Stress Test Report
|
| 2 |
-
|
| 3 |
-
## Scope
|
| 4 |
-
- Template targets: **56**
|
| 5 |
-
- Non-template targets: **5**
|
| 6 |
-
- Iterations per target: **100**
|
| 7 |
-
- Total runs: **6100**
|
| 8 |
-
- Modes cycled per target: **question**, **csv**, **json**
|
| 9 |
-
- Execution mode: deterministic offline mocks (`SCRAPERL_DISABLE_LIVE_LLM=1`)
|
| 10 |
-
|
| 11 |
-
## Aggregate Result
|
| 12 |
-
- Completed: **6100**
|
| 13 |
-
- Partial: **0**
|
| 14 |
-
- Failed: **0**
|
| 15 |
-
- Pass rate (completed/total): **100.00%**
|
| 16 |
-
- Schema failures: **0**
|
| 17 |
-
- Output-format mismatches: **0**
|
| 18 |
-
- Duration: **81.16 seconds**
|
| 19 |
-
|
| 20 |
-
## Per-Template Results
|
| 21 |
-
| Template | Runs | Completed | Partial | Failed |
|
| 22 |
-
|---|---:|---:|---:|---:|
|
| 23 |
-
| `airbnb` | 100 | 100 | 0 | 0 |
|
| 24 |
-
| `aliexpress` | 100 | 100 | 0 | 0 |
|
| 25 |
-
| `amazon` | 100 | 100 | 0 | 0 |
|
| 26 |
-
| `arxiv` | 100 | 100 | 0 | 0 |
|
| 27 |
-
| `bbc` | 100 | 100 | 0 | 0 |
|
| 28 |
-
| `bitbucket` | 100 | 100 | 0 | 0 |
|
| 29 |
-
| `bloomberg` | 100 | 100 | 0 | 0 |
|
| 30 |
-
| `booking` | 100 | 100 | 0 | 0 |
|
| 31 |
-
| `cnn` | 100 | 100 | 0 | 0 |
|
| 32 |
-
| `coindesk` | 100 | 100 | 0 | 0 |
|
| 33 |
-
| `coinmarketcap` | 100 | 100 | 0 | 0 |
|
| 34 |
-
| `coursera` | 100 | 100 | 0 | 0 |
|
| 35 |
-
| `devto` | 100 | 100 | 0 | 0 |
|
| 36 |
-
| `ebay` | 100 | 100 | 0 | 0 |
|
| 37 |
-
| `edx` | 100 | 100 | 0 | 0 |
|
| 38 |
-
| `etsy` | 100 | 100 | 0 | 0 |
|
| 39 |
-
| `facebook` | 100 | 100 | 0 | 0 |
|
| 40 |
-
| `freecodecamp` | 100 | 100 | 0 | 0 |
|
| 41 |
-
| `geeksforgeeks` | 100 | 100 | 0 | 0 |
|
| 42 |
-
| `github` | 100 | 100 | 0 | 0 |
|
| 43 |
-
| `gitlab` | 100 | 100 | 0 | 0 |
|
| 44 |
-
| `glassdoor` | 100 | 100 | 0 | 0 |
|
| 45 |
-
| `googlescholar` | 100 | 100 | 0 | 0 |
|
| 46 |
-
| `hackernews` | 100 | 100 | 0 | 0 |
|
| 47 |
-
| `huggingface` | 100 | 100 | 0 | 0 |
|
| 48 |
-
| `imdb` | 100 | 100 | 0 | 0 |
|
| 49 |
-
| `indeed` | 100 | 100 | 0 | 0 |
|
| 50 |
-
| `instagram` | 100 | 100 | 0 | 0 |
|
| 51 |
-
| `investopedia` | 100 | 100 | 0 | 0 |
|
| 52 |
-
| `kaggle` | 100 | 100 | 0 | 0 |
|
| 53 |
-
| `leetcode` | 100 | 100 | 0 | 0 |
|
| 54 |
-
| `linkedin` | 100 | 100 | 0 | 0 |
|
| 55 |
-
| `medium` | 100 | 100 | 0 | 0 |
|
| 56 |
-
| `npm` | 100 | 100 | 0 | 0 |
|
| 57 |
-
| `nytimes` | 100 | 100 | 0 | 0 |
|
| 58 |
-
| `openreview` | 100 | 100 | 0 | 0 |
|
| 59 |
-
| `paperswithcode` | 100 | 100 | 0 | 0 |
|
| 60 |
-
| `pinterest` | 100 | 100 | 0 | 0 |
|
| 61 |
-
| `producthunt` | 100 | 100 | 0 | 0 |
|
| 62 |
-
| `pypi` | 100 | 100 | 0 | 0 |
|
| 63 |
-
| `quora` | 100 | 100 | 0 | 0 |
|
| 64 |
-
| `reddit` | 100 | 100 | 0 | 0 |
|
| 65 |
-
| `reuters` | 100 | 100 | 0 | 0 |
|
| 66 |
-
| `soundcloud` | 100 | 100 | 0 | 0 |
|
| 67 |
-
| `spotify` | 100 | 100 | 0 | 0 |
|
| 68 |
-
| `stackoverflow` | 100 | 100 | 0 | 0 |
|
| 69 |
-
| `substack` | 100 | 100 | 0 | 0 |
|
| 70 |
-
| `tiktok` | 100 | 100 | 0 | 0 |
|
| 71 |
-
| `twitch` | 100 | 100 | 0 | 0 |
|
| 72 |
-
| `udemy` | 100 | 100 | 0 | 0 |
|
| 73 |
-
| `vimeo` | 100 | 100 | 0 | 0 |
|
| 74 |
-
| `walmart` | 100 | 100 | 0 | 0 |
|
| 75 |
-
| `wikipedia` | 100 | 100 | 0 | 0 |
|
| 76 |
-
| `x` | 100 | 100 | 0 | 0 |
|
| 77 |
-
| `youtube` | 100 | 100 | 0 | 0 |
|
| 78 |
-
| `zillow` | 100 | 100 | 0 | 0 |
|
| 79 |
-
|
| 80 |
-
## Non-Template Results
|
| 81 |
-
| Asset | Runs | Completed | Partial | Failed |
|
| 82 |
-
|---|---:|---:|---:|---:|
|
| 83 |
-
| `https://unknown-synth-alpha.test` | 100 | 100 | 0 | 0 |
|
| 84 |
-
| `https://unknown-synth-beta.test` | 100 | 100 | 0 | 0 |
|
| 85 |
-
| `https://unknown-synth-gamma.test` | 100 | 100 | 0 | 0 |
|
| 86 |
-
| `open source scraping tools benchmark` | 100 | 100 | 0 | 0 |
|
| 87 |
-
| `synthetic market intelligence dashboard comparison` | 100 | 100 | 0 | 0 |
|
| 88 |
-
|
| 89 |
-
## Failure Samples
|
| 90 |
-
- No failures captured.
|
| 91 |
-
|
| 92 |
-
## Notes
|
| 93 |
-
- Templates are used as **reference hints** (navigation targets/field hints), not rigid scraper scripts.
|
| 94 |
-
- Agent flow evaluates **assets + instructions + output_format + output_instructions** per request.
|
| 95 |
-
- Output schema validation checks strict column adherence for CSV/JSON runs.
|
| 96 |
-
- Raw machine summary: `docs/reports/template-stress-summary.json`.
|
| 97 |
-
|
| 98 |
-
---
|
| 99 |
-
|
| 100 |
-
## Additional Run: Non-Template Existing Domains (Question/CSV/JSON)
|
| 101 |
-
|
| 102 |
-
- Target domains: **35** (non-template existing sites)
|
| 103 |
-
- Output modes: **question**, **csv**, **json**
|
| 104 |
-
- Total cases: **105** (35 × 3)
|
| 105 |
-
- Completed: **105**
|
| 106 |
-
- Partial: **0**
|
| 107 |
-
- Failed: **0**
|
| 108 |
-
- Schema failures: **0**
|
| 109 |
-
- Output-format mismatches: **0**
|
| 110 |
-
- Duration: **1.8 seconds** (deterministic offline fixture mode)
|
| 111 |
-
|
| 112 |
-
Raw summary: `docs/reports/non-template-existing-summary.json`.
|
| 113 |
-
|
| 114 |
-
---
|
| 115 |
-
|
| 116 |
-
## Additional Run: Strict Output Contract Matrix (100 Template + 100 Non-Template)
|
| 117 |
-
|
| 118 |
-
- Template cases: **100**
|
| 119 |
-
- Non-template cases: **100**
|
| 120 |
-
- Total cases: **200**
|
| 121 |
-
- Modes: mixed **csv/json/text**
|
| 122 |
-
- Max steps configured per case: **999** (effectively unlimited)
|
| 123 |
-
- Completed: **200**
|
| 124 |
-
- Partial: **0**
|
| 125 |
-
- Failed: **0**
|
| 126 |
-
- Contract failures (requested schema mismatch): **0**
|
| 127 |
-
|
| 128 |
-
### Contract checks enforced per case
|
| 129 |
-
- CSV: returned `columns` must **exactly equal** requested CSV columns (order + names), and CSV header must match.
|
| 130 |
-
- JSON: first row keys must **exactly equal** requested JSON columns (order + names).
|
| 131 |
-
- Text/question: output must be non-empty.
|
| 132 |
-
|
| 133 |
-
Raw summary: `docs/reports/output-contract-200-summary.json`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|