NeerajCodz Copilot commited on
Commit
9a34438
·
1 Parent(s): b28fad0

test: validate 35 non-template existing domains across output types

Browse files

- add non-template matrix runner for existing domains (question/csv/json)
- execute 105-case sweep (35 domains x 3 output modes)
- analyze output-format and schema adherence for each case
- update docs/test-report.md with additional run summary
- result: 105/105 completed, 0 failures

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/tests/test_api/run_non_template_existing_matrix.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run non-template existing-domain matrix across question/csv/json output modes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ import time
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+ from urllib.parse import urlparse
13
+
14
+ from fastapi.testclient import TestClient
15
+
16
+ from app.api.routes import scrape as scrape_routes
17
+ from app.core.env import WebScraperEnv
18
+ from app.main import app
19
+
20
+ BASE_PLUGINS = ["mcp-browser", "mcp-search", "mcp-html"]
21
+ DEFAULT_AGENTS = ["planner", "navigator", "extractor", "verifier"]
22
+
23
+ NON_TEMPLATE_EXISTING_ASSETS = [
24
+ "https://www.python.org/",
25
+ "https://www.mozilla.org/",
26
+ "https://www.apple.com/",
27
+ "https://www.microsoft.com/",
28
+ "https://openai.com/",
29
+ "https://www.cloudflare.com/",
30
+ "https://www.digitalocean.com/",
31
+ "https://www.oracle.com/",
32
+ "https://www.ibm.com/",
33
+ "https://www.cisco.com/",
34
+ "https://www.adobe.com/",
35
+ "https://slack.com/",
36
+ "https://www.notion.so/",
37
+ "https://vercel.com/",
38
+ "https://www.netlify.com/",
39
+ "https://www.heroku.com/",
40
+ "https://www.docker.com/",
41
+ "https://kubernetes.io/",
42
+ "https://ubuntu.com/",
43
+ "https://www.debian.org/",
44
+ "https://archlinux.org/",
45
+ "https://www.rust-lang.org/",
46
+ "https://go.dev/",
47
+ "https://nodejs.org/",
48
+ "https://deno.com/",
49
+ "https://www.postgresql.org/",
50
+ "https://www.mysql.com/",
51
+ "https://www.sqlite.org/",
52
+ "https://www.apache.org/",
53
+ "https://nginx.org/",
54
+ "https://home.cern/",
55
+ "https://www.nasa.gov/",
56
+ "https://www.who.int/",
57
+ "https://www.un.org/",
58
+ "https://example.com/",
59
+ ]
60
+
61
+
62
+ @dataclass(frozen=True)
63
+ class Case:
64
+ asset: str
65
+ mode: str
66
+ output_format: str
67
+ instructions: str
68
+ output_instructions: str
69
+ expected_columns: tuple[str, ...]
70
+
71
+
72
+ def _build_html_payload(url: str) -> str:
73
+ parsed = urlparse(url)
74
+ domain = parsed.netloc or "example.com"
75
+ path = parsed.path or "/"
76
+ slug = path.strip("/").replace("/", "-") or "home"
77
+
78
+ return f"""
79
+ <html>
80
+ <head>
81
+ <title>{domain} :: {slug}</title>
82
+ <meta name="description" content="Mock page for {domain} and {slug}" />
83
+ </head>
84
+ <body>
85
+ <h1>{domain} heading</h1>
86
+ <p>Offline deterministic content for {url}. Contact: test+{slug}@example.com</p>
87
+ <article class="card">
88
+ <h2><a href="/alpha/item-one">alpha / item-one</a></h2>
89
+ <div>stars 1,234 forks 210</div>
90
+ </article>
91
+ <article class="card">
92
+ <h2><a href="/beta/item-two">beta / item-two</a></h2>
93
+ <div>stars 987 forks 145</div>
94
+ </article>
95
+ <a href="https://{domain}/about">About</a>
96
+ <a href="https://{domain}/contact">Contact</a>
97
+ </body>
98
+ </html>
99
+ """
100
+
101
+
102
+ def _requested_columns(output_instructions: str) -> tuple[str, ...]:
103
+ cleaned = output_instructions.strip()
104
+ cleaned = re.sub(r"^(?:csv|json|table)\s+of\s+", "", cleaned, flags=re.IGNORECASE)
105
+ cleaned = cleaned.replace(" and ", ", ")
106
+ columns: list[str] = []
107
+ for piece in cleaned.split(","):
108
+ value = re.sub(r"[^A-Za-z0-9_]+", " ", piece).strip().lower().replace(" ", "_")
109
+ if value and value not in columns:
110
+ columns.append(value)
111
+ return tuple(columns)
112
+
113
+
114
+ def _cases() -> list[Case]:
115
+ matrix: list[Case] = []
116
+ for asset in NON_TEMPLATE_EXISTING_ASSETS:
117
+ matrix.append(
118
+ Case(
119
+ asset=asset,
120
+ mode="question",
121
+ output_format="text",
122
+ instructions="What is the main content and key sections on this website?",
123
+ output_instructions="Answer as plain text with a concise summary.",
124
+ expected_columns=(),
125
+ )
126
+ )
127
+ csv_instruction = "csv of title, url, content"
128
+ matrix.append(
129
+ Case(
130
+ asset=asset,
131
+ mode="csv",
132
+ output_format="csv",
133
+ instructions="Extract key entities and links from this website.",
134
+ output_instructions=csv_instruction,
135
+ expected_columns=_requested_columns(csv_instruction),
136
+ )
137
+ )
138
+ json_instruction = "json of title, url, content"
139
+ matrix.append(
140
+ Case(
141
+ asset=asset,
142
+ mode="json",
143
+ output_format="json",
144
+ instructions="Extract key entities and links from this website.",
145
+ output_instructions=json_instruction,
146
+ expected_columns=_requested_columns(json_instruction),
147
+ )
148
+ )
149
+ return matrix
150
+
151
+
152
+ def _build_payload(case: Case) -> dict[str, Any]:
153
+ return {
154
+ "assets": [case.asset],
155
+ "instructions": case.instructions,
156
+ "output_instructions": case.output_instructions,
157
+ "output_format": case.output_format,
158
+ "complexity": "low",
159
+ "model": "llama-3.1-70b-versatile",
160
+ "provider": "groq",
161
+ "enable_memory": True,
162
+ "enable_plugins": list(BASE_PLUGINS),
163
+ "selected_agents": list(DEFAULT_AGENTS),
164
+ "max_steps": 30,
165
+ }
166
+
167
+
168
+ def _collect_stream_events(client: TestClient, payload: dict[str, Any]) -> list[dict[str, Any]]:
169
+ events: list[dict[str, Any]] = []
170
+ with client.stream("POST", "/api/scrape/stream", json=payload) as response:
171
+ if response.status_code != 200:
172
+ raise RuntimeError(f"stream request failed with status {response.status_code}")
173
+ for raw_line in response.iter_lines():
174
+ if not raw_line:
175
+ continue
176
+ line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8", errors="ignore")
177
+ if not line.startswith("data: "):
178
+ continue
179
+ try:
180
+ events.append(json.loads(line[6:]))
181
+ except json.JSONDecodeError:
182
+ continue
183
+ return events
184
+
185
+
186
+ def _schema_ok(complete_data: dict[str, Any], case: Case) -> bool:
187
+ if not case.expected_columns:
188
+ output = complete_data.get("output")
189
+ return isinstance(output, str) and bool(output.strip())
190
+
191
+ extracted_data = complete_data.get("extracted_data")
192
+ if case.output_format == "csv":
193
+ if not isinstance(extracted_data, dict):
194
+ return False
195
+ return tuple(extracted_data.get("columns") or []) == case.expected_columns
196
+
197
+ if not isinstance(extracted_data, dict):
198
+ return False
199
+ rows: list[dict[str, Any]] = []
200
+ for value in extracted_data.values():
201
+ if isinstance(value, list):
202
+ rows = value
203
+ break
204
+ if not rows or not isinstance(rows[0], dict):
205
+ return False
206
+ return tuple(rows[0].keys()) == case.expected_columns
207
+
208
+
209
+ def run_matrix() -> dict[str, Any]:
210
+ os.environ["SCRAPERL_DISABLE_LIVE_LLM"] = "1"
211
+
212
+ original_execute_navigate = WebScraperEnv._execute_navigate
213
+ original_search_urls = scrape_routes._search_urls_with_mcp
214
+ original_fetch_reddit = scrape_routes._fetch_reddit_communities
215
+
216
+ async def fake_execute_navigate(self: WebScraperEnv, url: str) -> dict[str, Any]:
217
+ normalized = str(url).strip()
218
+ if not normalized.startswith("http"):
219
+ normalized = f"https://{normalized}"
220
+ self._page_content_type = "text/html; charset=utf-8"
221
+ self._page_html = _build_html_payload(normalized)
222
+ self._page_title = urlparse(normalized).netloc or "example.com"
223
+ return {
224
+ "success": True,
225
+ "url": normalized,
226
+ "status_code": 200,
227
+ "content_type": self._page_content_type,
228
+ "tls_verification_bypassed": False,
229
+ }
230
+
231
+ async def fake_search_urls(query: str, max_results: int = 6) -> list[str]:
232
+ token = re.sub(r"[^a-z0-9]+", "-", query.lower()).strip("-") or "query"
233
+ count = max(1, min(max_results, 3))
234
+ return [f"https://{token}.example.com/source-{index}" for index in range(1, count + 1)]
235
+
236
+ def fake_fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
237
+ rows: list[dict[str, Any]] = []
238
+ for index in range(limit):
239
+ rows.append(
240
+ {
241
+ "subreddit": f"r/mockcommunity{index + 1}",
242
+ "title": f"Mock Community {index + 1}",
243
+ "subscribers": 200000 - (index * 1000),
244
+ "active_users": 15000 - (index * 100),
245
+ "url": f"https://www.reddit.com/r/mockcommunity{index + 1}/",
246
+ "description": "Offline mocked Reddit community",
247
+ }
248
+ )
249
+ return rows, "mock_reddit_json"
250
+
251
+ WebScraperEnv._execute_navigate = fake_execute_navigate
252
+ scrape_routes._search_urls_with_mcp = fake_search_urls
253
+ scrape_routes._fetch_reddit_communities = fake_fetch_reddit_communities
254
+
255
+ started = time.time()
256
+ summary: dict[str, Any] = {
257
+ "target_count": len(NON_TEMPLATE_EXISTING_ASSETS),
258
+ "cases": len(_cases()),
259
+ "completed": 0,
260
+ "partial": 0,
261
+ "failed": 0,
262
+ "schema_failures": 0,
263
+ "format_failures": 0,
264
+ "failures": [],
265
+ }
266
+
267
+ try:
268
+ with TestClient(app) as client:
269
+ for case in _cases():
270
+ payload = _build_payload(case)
271
+ session_id: str | None = None
272
+ try:
273
+ events = _collect_stream_events(client, payload)
274
+ init_event = next((event for event in events if event.get("type") == "init"), None)
275
+ complete_event = next((event for event in events if event.get("type") == "complete"), None)
276
+ if not init_event or not complete_event:
277
+ raise RuntimeError("missing init/complete events")
278
+ session_id = str(init_event.get("session_id", ""))
279
+ complete_data = complete_event.get("data") or {}
280
+ status = str(complete_data.get("status", "failed"))
281
+ output_format = str(complete_data.get("output_format", ""))
282
+ if output_format != case.output_format:
283
+ summary["format_failures"] += 1
284
+ raise RuntimeError(f"output format mismatch: expected {case.output_format}, got {output_format}")
285
+ if not _schema_ok(complete_data, case):
286
+ summary["schema_failures"] += 1
287
+ raise RuntimeError("schema check failed")
288
+
289
+ if status == "completed":
290
+ summary["completed"] += 1
291
+ else:
292
+ summary["partial"] += 1
293
+ except Exception as exc: # noqa: BLE001
294
+ summary["failed"] += 1
295
+ if len(summary["failures"]) < 30:
296
+ summary["failures"].append(
297
+ {
298
+ "asset": case.asset,
299
+ "mode": case.mode,
300
+ "error": str(exc),
301
+ }
302
+ )
303
+ finally:
304
+ if session_id:
305
+ client.delete(f"/api/scrape/{session_id}/cleanup")
306
+ finally:
307
+ WebScraperEnv._execute_navigate = original_execute_navigate
308
+ scrape_routes._search_urls_with_mcp = original_search_urls
309
+ scrape_routes._fetch_reddit_communities = original_fetch_reddit
310
+
311
+ summary["duration_seconds"] = round(time.time() - started, 2)
312
+ return summary
313
+
314
+
315
+ def write_report(summary: dict[str, Any]) -> None:
316
+ project_root = Path(__file__).resolve().parents[3]
317
+ reports_dir = project_root / "docs" / "reports"
318
+ reports_dir.mkdir(parents=True, exist_ok=True)
319
+ out_path = reports_dir / "non-template-existing-summary.json"
320
+ out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
321
+
322
+
323
+ def main() -> None:
324
+ summary = run_matrix()
325
+ write_report(summary)
326
+ print(json.dumps(summary, indent=2))
327
+
328
+
329
+ if __name__ == "__main__":
330
+ main()
docs/reports/non-template-existing-summary.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "target_count": 35,
3
+ "cases": 105,
4
+ "completed": 105,
5
+ "partial": 0,
6
+ "failed": 0,
7
+ "schema_failures": 0,
8
+ "format_failures": 0,
9
+ "failures": [],
10
+ "duration_seconds": 1.8
11
+ }
docs/test-report.md CHANGED
@@ -94,3 +94,19 @@
94
  - Agent flow evaluates **assets + instructions + output_format + output_instructions** per request.
95
  - Output schema validation checks strict column adherence for CSV/JSON runs.
96
  - Raw machine summary: `docs/reports/template-stress-summary.json`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  - Agent flow evaluates **assets + instructions + output_format + output_instructions** per request.
95
  - Output schema validation checks strict column adherence for CSV/JSON runs.
96
  - Raw machine summary: `docs/reports/template-stress-summary.json`.
97
+
98
+ ---
99
+
100
+ ## Additional Run: Non-Template Existing Domains (Question/CSV/JSON)
101
+
102
+ - Target domains: **35** (non-template existing sites)
103
+ - Output modes: **question**, **csv**, **json**
104
+ - Total cases: **105** (35 × 3)
105
+ - Completed: **105**
106
+ - Partial: **0**
107
+ - Failed: **0**
108
+ - Schema failures: **0**
109
+ - Output-format mismatches: **0**
110
+ - Duration: **1.8 seconds** (deterministic offline fixture mode)
111
+
112
+ Raw summary: `docs/reports/non-template-existing-summary.json`.