NeerajCodz Copilot committed on
Commit
027ebd4
·
1 Parent(s): 9a34438

test: verify strict output contracts on 200-case matrix

Browse files

- add strict output-contract runner for 100 template + 100 non-template mixed cases
- run with max_steps=999 to allow effectively unlimited agent steps
- enforce exact requested CSV/JSON schema matching (no extra columns)
- update docs/test-report.md with contract validation results
- result: 200/200 completed, 0 contract mismatches

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/tests/test_api/run_output_contract_matrix.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Strict output-contract matrix: 100 template + 100 non-template cases."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ import time
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+ from urllib.parse import urlparse
13
+
14
+ from fastapi.testclient import TestClient
15
+
16
+ from app.api.routes import scrape as scrape_routes
17
+ from app.core.env import WebScraperEnv
18
+ from app.main import app
19
+ from app.sites.templates import SITE_TEMPLATES
20
+
21
# Plugins enabled on every request in the matrix.
BASE_PLUGINS = ["mcp-browser", "mcp-search", "mcp-html"]
# Agent pipeline selected for every case.
DEFAULT_AGENTS = ["planner", "navigator", "extractor", "verifier"]
# Cases per bucket: 100 template + 100 non-template -> 200 total.
CASE_COUNT_PER_BUCKET = 100

# Assets for the non-template bucket. Mostly real-world URLs; the last two
# entries are free-text queries that exercise the search path instead of
# direct navigation. Cases cycle through this list modulo its length.
NON_TEMPLATE_ASSETS = [
    "https://www.python.org/",
    "https://www.mozilla.org/",
    "https://www.apple.com/",
    "https://www.microsoft.com/",
    "https://openai.com/",
    "https://www.cloudflare.com/",
    "https://www.digitalocean.com/",
    "https://www.oracle.com/",
    "https://www.ibm.com/",
    "https://www.cisco.com/",
    "https://www.adobe.com/",
    "https://slack.com/",
    "https://www.notion.so/",
    "https://vercel.com/",
    "https://www.netlify.com/",
    "https://www.heroku.com/",
    "https://www.docker.com/",
    "https://kubernetes.io/",
    "https://ubuntu.com/",
    "https://www.debian.org/",
    "https://archlinux.org/",
    "https://www.rust-lang.org/",
    "https://go.dev/",
    "https://nodejs.org/",
    "https://deno.com/",
    "https://www.postgresql.org/",
    "https://www.mysql.com/",
    "https://www.sqlite.org/",
    "https://www.apache.org/",
    "https://nginx.org/",
    "https://home.cern/",
    "https://www.nasa.gov/",
    "https://www.who.int/",
    "https://www.un.org/",
    "https://example.com/",
    "open source scraping frameworks comparison",
    "synthetic unknown portal data feed",
]
64
+
65
+
66
@dataclass(frozen=True)
class ContractCase:
    """A single case in the 200-case strict output-contract matrix."""

    bucket: str  # "template" or "non-template"
    id: str  # unique case identifier (e.g. "template-001-<site_id>")
    asset: str  # target URL, or a free-text query for search-based cases
    mode: str  # "csv", "json", or "text"
    output_format: str  # format requested in the API payload (matches mode)
    instructions: str  # extraction instructions sent to the agent
    output_instructions: str  # output contract the response must honor
    expected_columns: tuple[str, ...]  # exact expected columns; empty for text mode
76
+
77
+
78
def _build_html_payload(url: str) -> str:
    """Render a deterministic mock HTML page for *url* (offline fixture).

    The page embeds the URL's domain and path slug into the title, heading,
    contact email, and links so downstream extraction has stable content.
    """
    parsed = urlparse(url)
    domain = parsed.netloc or "example.com"
    path = parsed.path or "/"
    # e.g. "/a/b/" -> "a-b"; an empty path falls back to "home".
    slug = path.strip("/").replace("/", "-") or "home"
    return f"""
<html>
<head>
<title>{domain} :: {slug}</title>
<meta name="description" content="Mock page for {domain} and {slug}" />
</head>
<body>
<h1>{domain} heading</h1>
<p>Mock content for {url}. Contact: test+{slug}@example.com</p>
<article class="card">
<h2><a href="/alpha/repo-one">alpha / repo-one</a></h2>
<div>stars 1,234 forks 210</div>
</article>
<article class="card">
<h2><a href="/beta/repo-two">beta / repo-two</a></h2>
<div>stars 987 forks 145</div>
</article>
<a href="https://{domain}/about">About</a>
<a href="https://{domain}/contact">Contact</a>
</body>
</html>
"""
105
+
106
+
107
+ def _requested_columns(output_instructions: str) -> tuple[str, ...]:
108
+ cleaned = output_instructions.strip()
109
+ cleaned = re.sub(r"^(?:csv|json|table)\s+of\s+", "", cleaned, flags=re.IGNORECASE)
110
+ cleaned = cleaned.replace(" and ", ", ")
111
+ columns: list[str] = []
112
+ for piece in cleaned.split(","):
113
+ value = re.sub(r"[^A-Za-z0-9_]+", " ", piece).strip().lower().replace(" ", "_")
114
+ if value and value not in columns:
115
+ columns.append(value)
116
+ return tuple(columns)
117
+
118
+
119
def _build_template_cases() -> list[ContractCase]:
    """Build the 100 template-bucket cases, cycling templates and modes.

    Mode cycles csv -> json -> text per index; each case targets the
    template's first domain and derives its contract from the template's
    first four output fields (falling back to title/url).
    """
    templates = list(SITE_TEMPLATES)
    cases: list[ContractCase] = []
    for index in range(CASE_COUNT_PER_BUCKET):
        template = templates[index % len(templates)]
        mode = ("csv", "json", "text")[index % 3]
        fields = tuple(str(field).lower() for field in template.output_fields[:4]) or ("title", "url")
        asset = f"https://{template.domains[0]}"
        case_id = f"template-{index + 1:03d}-{template.site_id}"
        if mode == "csv":
            instructions = f"Extract top visible {template.extraction_goal} records from this asset."
            output_instructions = f"csv of {', '.join(fields)}"
        elif mode == "json":
            instructions = f"Extract structured {template.extraction_goal} entities."
            output_instructions = f"json of {', '.join(fields)}"
        else:
            instructions = f"What are the top visible {template.extraction_goal} on this target?"
            output_instructions = "Answer in concise plain text."
        # Text answers carry no tabular contract.
        expected = () if mode == "text" else _requested_columns(output_instructions)
        cases.append(
            ContractCase(
                bucket="template",
                id=case_id,
                asset=asset,
                mode=mode,
                output_format=mode,
                instructions=instructions,
                output_instructions=output_instructions,
                expected_columns=expected,
            )
        )
    return cases
170
+
171
+
172
def _build_non_template_cases() -> list[ContractCase]:
    """Build the 100 non-template cases over generic assets and contracts.

    Assets cycle through NON_TEMPLATE_ASSETS; the mode cycles
    csv -> json -> text, and csv/json cases rotate through three fixed
    output contracts each.
    """
    csv_contracts = [
        "csv of title, url, content",
        "csv of username, repo, stars, forks",
        "csv of name, url, summary",
    ]
    json_contracts = [
        "json of title, url, content",
        "json of entity, metric, value",
        "json of name, url, summary",
    ]
    cases: list[ContractCase] = []
    for index in range(CASE_COUNT_PER_BUCKET):
        asset = NON_TEMPLATE_ASSETS[index % len(NON_TEMPLATE_ASSETS)]
        mode = ("csv", "json", "text")[index % 3]
        if mode == "csv":
            instructions = "Extract key entities and metadata from this asset."
            output_instructions = csv_contracts[index % len(csv_contracts)]
        elif mode == "json":
            instructions = "Extract key entities and metadata from this asset."
            output_instructions = json_contracts[index % len(json_contracts)]
        else:
            instructions = "What is on this target and what are the most relevant points?"
            output_instructions = "Answer in concise plain text."
        cases.append(
            ContractCase(
                bucket="non-template",
                id=f"non-template-{index + 1:03d}",
                asset=asset,
                mode=mode,
                output_format=mode,
                instructions=instructions,
                output_instructions=output_instructions,
                # Only tabular modes carry a strict column contract.
                expected_columns=() if mode == "text" else _requested_columns(output_instructions),
            )
        )
    return cases
231
+
232
+
233
def _build_payload(case: ContractCase) -> dict[str, Any]:
    """Translate a ContractCase into the /api/scrape request body."""
    request_body: dict[str, Any] = {
        "assets": [case.asset],
        "instructions": case.instructions,
        "output_instructions": case.output_instructions,
        "output_format": case.output_format,
        "complexity": "high",
        "model": "llama-3.1-70b-versatile",
        "provider": "groq",
        "enable_memory": True,
        # Fresh copies so per-request mutation cannot touch the module defaults.
        "enable_plugins": list(BASE_PLUGINS),
        "selected_agents": list(DEFAULT_AGENTS),
        "max_steps": 999,  # effectively unlimited for this matrix
    }
    return request_body
247
+
248
+
249
+ def _collect_stream_events(client: TestClient, payload: dict[str, Any]) -> list[dict[str, Any]]:
250
+ events: list[dict[str, Any]] = []
251
+ with client.stream("POST", "/api/scrape/stream", json=payload) as response:
252
+ if response.status_code != 200:
253
+ raise RuntimeError(f"stream request failed with status {response.status_code}")
254
+ for raw_line in response.iter_lines():
255
+ if not raw_line:
256
+ continue
257
+ line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8", errors="ignore")
258
+ if not line.startswith("data: "):
259
+ continue
260
+ try:
261
+ events.append(json.loads(line[6:]))
262
+ except json.JSONDecodeError:
263
+ continue
264
+ return events
265
+
266
+
267
+ def _csv_header(output: str) -> tuple[str, ...]:
268
+ first_line = output.splitlines()[0] if output else ""
269
+ if not first_line:
270
+ return tuple()
271
+ return tuple(part.strip().lower() for part in first_line.split(","))
272
+
273
+
274
+ def _extract_first_rows(extracted_data: Any) -> list[dict[str, Any]]:
275
+ if isinstance(extracted_data, dict):
276
+ if isinstance(extracted_data.get("rows"), list):
277
+ return extracted_data.get("rows", [])
278
+ for value in extracted_data.values():
279
+ if isinstance(value, list):
280
+ return value
281
+ return []
282
+
283
+
284
def _contract_ok(complete_data: dict[str, Any], case: ContractCase) -> tuple[bool, str]:
    """Check that *complete_data* honors the case's strict output contract.

    Returns (ok, reason). The reason string is only meaningful when ok is
    False; the success paths return an empty (or unused) reason.
    """
    # The reported format must match exactly what the case requested.
    if str(complete_data.get("output_format", "")) != case.output_format:
        return False, "output_format mismatch"

    if case.output_format == "text":
        # Text answers only need to be a non-empty, non-whitespace string.
        output = complete_data.get("output")
        return (isinstance(output, str) and bool(output.strip())), "empty text output"

    extracted_data = complete_data.get("extracted_data")
    # csv/json cases must always carry a column contract; an empty tuple
    # here means the case itself was built incorrectly.
    if not case.expected_columns:
        return False, "missing expected contract columns"

    if case.output_format == "csv":
        if not isinstance(extracted_data, dict):
            return False, "csv extracted_data is not dict"
        # Both the structured "columns" list and the rendered CSV header
        # must equal the requested columns exactly (order and names).
        columns = tuple((extracted_data.get("columns") or []))
        if columns != case.expected_columns:
            return False, f"csv column mismatch expected={case.expected_columns} got={columns}"
        header = _csv_header(str(complete_data.get("output", "")))
        if header != case.expected_columns:
            return False, f"csv header mismatch expected={case.expected_columns} got={header}"
        return True, ""

    # json: the first row's keys must equal the requested columns exactly.
    rows = _extract_first_rows(extracted_data)
    if not rows or not isinstance(rows[0], dict):
        return False, "json rows missing"
    keys = tuple(rows[0].keys())
    if keys != case.expected_columns:
        return False, f"json key mismatch expected={case.expected_columns} got={keys}"
    return True, ""
314
+
315
+
316
def run_matrix() -> dict[str, Any]:
    """Run all 200 cases against the app with deterministic offline fakes.

    Monkeypatches navigation, search, and the Reddit fetcher so no network
    traffic occurs, streams every case through ``/api/scrape/stream``,
    validates the strict output contract for each, and returns an
    aggregate summary dict. The patched attributes are restored in a
    ``finally`` block even if the run aborts.
    """
    # Disable live LLM calls for the whole process (not restored afterwards).
    os.environ["SCRAPERL_DISABLE_LIVE_LLM"] = "1"

    # Keep the originals so they can be restored after the run.
    original_execute_navigate = WebScraperEnv._execute_navigate
    original_search_urls = scrape_routes._search_urls_with_mcp
    original_fetch_reddit = scrape_routes._fetch_reddit_communities

    async def fake_execute_navigate(self: WebScraperEnv, url: str) -> dict[str, Any]:
        # Deterministic offline "navigation": normalize the URL, install a
        # mock HTML page on the env, and report a successful 200 fetch.
        normalized = str(url).strip()
        if not normalized.startswith("http"):
            normalized = f"https://{normalized}"
        self._page_content_type = "text/html; charset=utf-8"
        self._page_html = _build_html_payload(normalized)
        self._page_title = urlparse(normalized).netloc or "example.com"
        return {
            "success": True,
            "url": normalized,
            "status_code": 200,
            "content_type": self._page_content_type,
            "tls_verification_bypassed": False,
        }

    async def fake_search_urls(query: str, max_results: int = 6) -> list[str]:
        # Deterministic mock search results derived from the query text;
        # capped at 3 results regardless of max_results.
        token = re.sub(r"[^a-z0-9]+", "-", query.lower()).strip("-") or "query"
        count = max(1, min(max_results, 3))
        return [f"https://{token}.example.com/source-{index}" for index in range(1, count + 1)]

    def fake_fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
        # Deterministic mock Reddit communities with decreasing counts.
        rows: list[dict[str, Any]] = []
        for index in range(limit):
            rows.append(
                {
                    "subreddit": f"r/mockcommunity{index + 1}",
                    "title": f"Mock Community {index + 1}",
                    "subscribers": 200000 - (index * 1000),
                    "active_users": 15000 - (index * 100),
                    "url": f"https://www.reddit.com/r/mockcommunity{index + 1}/",
                    "description": "Offline mocked Reddit community",
                }
            )
        return rows, "mock_reddit_json"

    # Install the fakes (restored in the finally block below).
    WebScraperEnv._execute_navigate = fake_execute_navigate
    scrape_routes._search_urls_with_mcp = fake_search_urls
    scrape_routes._fetch_reddit_communities = fake_fetch_reddit_communities

    template_cases = _build_template_cases()
    non_template_cases = _build_non_template_cases()
    all_cases = [*template_cases, *non_template_cases]

    started = time.time()
    summary: dict[str, Any] = {
        "template_cases": len(template_cases),
        "non_template_cases": len(non_template_cases),
        "total_cases": len(all_cases),
        "completed": 0,
        "partial": 0,
        "failed": 0,
        "contract_failures": 0,
        "failures": [],  # capped at 40 entries below
    }

    try:
        with TestClient(app) as client:
            for case in all_cases:
                payload = _build_payload(case)
                session_id: str | None = None
                try:
                    events = _collect_stream_events(client, payload)
                    init_event = next((event for event in events if event.get("type") == "init"), None)
                    complete_event = next((event for event in events if event.get("type") == "complete"), None)
                    if not init_event or not complete_event:
                        raise RuntimeError("missing init/complete events")
                    session_id = str(init_event.get("session_id", ""))
                    complete_data = complete_event.get("data") or {}
                    status = str(complete_data.get("status", "failed"))
                    # A contract violation counts as both a contract failure
                    # and (via the raised RuntimeError) a failed case.
                    ok, reason = _contract_ok(complete_data, case)
                    if not ok:
                        summary["contract_failures"] += 1
                        raise RuntimeError(reason)
                    if status == "completed":
                        summary["completed"] += 1
                    else:
                        summary["partial"] += 1
                except Exception as exc:  # noqa: BLE001
                    summary["failed"] += 1
                    if len(summary["failures"]) < 40:
                        summary["failures"].append(
                            {
                                "case_id": case.id,
                                "bucket": case.bucket,
                                "asset": case.asset,
                                "mode": case.mode,
                                "error": str(exc),
                            }
                        )
                finally:
                    # Best-effort per-case cleanup of the server session.
                    if session_id:
                        client.delete(f"/api/scrape/{session_id}/cleanup")
    finally:
        # Always restore the patched attributes, even on abort.
        WebScraperEnv._execute_navigate = original_execute_navigate
        scrape_routes._search_urls_with_mcp = original_search_urls
        scrape_routes._fetch_reddit_communities = original_fetch_reddit

    summary["duration_seconds"] = round(time.time() - started, 2)
    return summary
422
+
423
+
424
def write_summary(summary: dict[str, Any]) -> None:
    """Persist the matrix summary as pretty-printed JSON under docs/reports/."""
    # backend/tests/test_api/<this file> -> parents[3] is the repository root.
    reports_dir = Path(__file__).resolve().parents[3] / "docs" / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    target = reports_dir / "output-contract-200-summary.json"
    target.write_text(json.dumps(summary, indent=2), encoding="utf-8")
430
+
431
+
432
def main() -> None:
    """Entry point: run the matrix, persist the report, echo it to stdout."""
    results = run_matrix()
    write_summary(results)
    print(json.dumps(results, indent=2))
436
+
437
+
438
# Allow running the matrix directly as a script.
if __name__ == "__main__":
    main()
docs/reports/output-contract-200-summary.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "template_cases": 100,
3
+ "non_template_cases": 100,
4
+ "total_cases": 200,
5
+ "completed": 200,
6
+ "partial": 0,
7
+ "failed": 0,
8
+ "contract_failures": 0,
9
+ "failures": [],
10
+ "duration_seconds": 5.69
11
+ }
docs/test-report.md CHANGED
@@ -110,3 +110,24 @@
110
  - Duration: **1.8 seconds** (deterministic offline fixture mode)
111
 
112
  Raw summary: `docs/reports/non-template-existing-summary.json`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  - Duration: **1.8 seconds** (deterministic offline fixture mode)
111
 
112
  Raw summary: `docs/reports/non-template-existing-summary.json`.
113
+
114
+ ---
115
+
116
+ ## Additional Run: Strict Output Contract Matrix (100 Template + 100 Non-Template)
117
+
118
+ - Template cases: **100**
119
+ - Non-template cases: **100**
120
+ - Total cases: **200**
121
+ - Modes: mixed **csv/json/text**
122
+ - Max steps configured per case: **999** (effectively unlimited)
123
+ - Completed: **200**
124
+ - Partial: **0**
125
+ - Failed: **0**
126
+ - Contract failures (requested schema mismatch): **0**
127
+
128
+ ### Contract checks enforced per case
129
+ - CSV: returned `columns` must **exactly equal** requested CSV columns (order + names), and CSV header must match.
130
+ - JSON: first row keys must **exactly equal** requested JSON columns (order + names).
131
+ - Text/question: output must be non-empty.
132
+
133
+ Raw summary: `docs/reports/output-contract-200-summary.json`.