NeerajCodz commited on
Commit
c2f6d26
·
1 Parent(s): 02cc090

feat: identify working LLM models and improve debugging

Browse files

- Tested Gemini API and discovered API key has access to gemini-2.5-flash and gemini-2.0-flash (NOT 1.5)
- Verified Groq llama-3.3-70b-versatile works successfully for extraction
- Confirmed LLM extraction pipeline IS working: example.com returns proper JSON with Groq
- Added comprehensive debug logging to track model resolution through router and providers
- Fixed GoogleProvider 404 error handler to extract actual model name from request URL
- Clear Python cache before testing to ensure code changes are picked up

VERIFIED: AI-driven extraction works with Groq on simple sites
TODO: Debug why CSV/JSON output not appearing in final streaming response for complex sites

Files changed (2) hide show
  1. backend/hacker_news.csv +6 -31
  2. test_response.txt +66 -0
backend/hacker_news.csv CHANGED
@@ -1,31 +1,6 @@
1
- title,link,points
2
- ,,914 points
3
- ,,392 points
4
- ,,172 points
5
- ,,541 points
6
- ,,417 points
7
- ,,193 points
8
- ,,197 points
9
- ,,283 points
10
- ,,253 points
11
- ,,8 points
12
- ,,84 points
13
- ,,19 points
14
- ,,703 points
15
- ,,66 points
16
- ,,25 points
17
- ,,280 points
18
- ,,125 points
19
- ,,11 points
20
- ,,148 points
21
- ,,112 points
22
- ,,160 points
23
- ,,52 points
24
- ,,394 points
25
- ,,123 points
26
- ,,103 points
27
- ,,152 points
28
- ,,83 points
29
- ,,105 points
30
- ,,254 points
31
- ,,
 
1
+ title,points
2
+ ,199 points
3
+ ,293 points
4
+ ,990 points
5
+ ,464 points
6
+ ,576 points
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_response.txt ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data: {"type": "init", "session_id": "26d3ce6a-99dd-4354-80de-4b7b014af1ee"}
2
+
3
+ data: {"type": "step", "data": {"step_number": 0, "action": "plugins", "url": null, "status": "completed", "message": "No plugins enabled", "reward": 0.0, "extracted_data": {"requested": [], "enabled": [], "missing": [], "navigation_strategy": "single_page", "extraction_goal": "basic_extraction", "site_template_id": null, "site_template_name": null, "site_template_domains": []}, "duration_ms": null, "timestamp": "2026-04-08T03:48:13.713825+00:00"}}
4
+
5
+ data: {"type": "step", "data": {"step_number": 2, "action": "planner", "url": null, "status": "completed", "message": "Planner created execution plan for 1 assets", "reward": 0.15, "extracted_data": {"assets": ["https://example.com"], "instructions": "Extract the heading and paragraph", "output_instructions": "json with heading and paragraph fields", "site_template": null}, "duration_ms": null, "timestamp": "2026-04-08T03:48:13.715043+00:00"}}
6
+
7
+ data: {"type": "step", "data": {"step_number": 3, "action": "tool_call", "url": null, "status": "running", "message": "url.parse(urls=[\"https://example.com\"], count=1)", "reward": 0.0, "extracted_data": {"tool_name": "url.parse", "tool_description": "Parse and validate target URLs", "parameters": {"urls": ["https://example.com"], "count": 1}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:13.715193+00:00"}}
8
+
9
+ data: {"type": "step", "data": {"step_number": 4, "action": "tool_call", "url": null, "status": "completed", "message": "url.parse() \u2192 parsed=1, domains=['example.com']", "reward": 0.05, "extracted_data": {"tool_name": "url.parse", "tool_description": "Parse and validate target URLs", "parameters": {"urls": ["https://example.com"]}, "result": {"parsed": 1, "domains": ["example.com"]}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:13.715299+00:00"}}
10
+
11
+ data: {"type": "step", "data": {"step_number": 5, "action": "navigator", "url": "https://example.com", "status": "running", "message": "Navigator selected source 1/1 (single_page)", "reward": 0.05, "extracted_data": {"site_template_id": null, "site_template_name": null}, "duration_ms": null, "timestamp": "2026-04-08T03:48:13.716452+00:00"}}
12
+
13
+ data: {"type": "url_start", "url": "https://example.com", "index": 0, "total": 1}
14
+
15
+ data: {"type": "step", "data": {"step_number": 1, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "llm.plan_navigation() \u2192 https://example.com", "reward": 0.15, "extracted_data": {"tool_name": "llm.plan_navigation", "tool_description": "LLM decides optimal navigation URL based on instructions", "parameters": {"instructions": "Extract the heading and paragraph", "base_url": "https://example.com"}, "result": "https://example.com", "mode": "llm"}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.074626+00:00"}}
16
+
17
+ data: {"type": "step", "data": {"step_number": 2, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "validate.url(url='https://example.com') \u2192 valid", "reward": 0.05, "extracted_data": {"tool_name": "validate.url", "tool_description": "Validate and normalize navigation URL", "parameters": {"url": "https://example.com"}, "result": {"valid": true, "normalized_url": "https://example.com"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.074849+00:00"}}
18
+
19
+ data: {"type": "step", "data": {"step_number": 3, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "browser.navigate(url='https://example.com')", "reward": 0.0, "extracted_data": {"tool_name": "browser.navigate", "tool_description": "Navigate browser to target URL", "parameters": {"url": "https://example.com", "wait_for": "page_load"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.074942+00:00"}}
20
+
21
+ data: {"type": "step", "data": {"step_number": 3, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "browser.navigate() \u2192 Success", "reward": 0.6500000000000001, "extracted_data": {"tool_name": "browser.navigate", "tool_description": "Navigate browser to target URL", "parameters": {"url": "https://example.com"}, "result": {"status_code": true}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.405862+00:00"}}
22
+
23
+ data: {"type": "step", "data": {"step_number": 4, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "html.parse(html=page_content)", "reward": 0.0, "extracted_data": {"tool_name": "html.parse", "tool_description": "Parse HTML into DOM structure", "parameters": {"content_length": 528}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.406039+00:00"}}
24
+
25
+ data: {"type": "step", "data": {"step_number": 4, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "html.parse() \u2192 DOM ready", "reward": 0.1, "extracted_data": {"tool_name": "html.parse", "tool_description": "Parse HTML into DOM structure", "result": {"elements_count": 11}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407017+00:00"}}
26
+
27
+ data: {"type": "step", "data": {"step_number": 5, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "extract.urls(html)", "reward": 0.0, "extracted_data": {"tool_name": "extract.urls", "tool_description": "Extract hyperlinks from parsed HTML", "parameters": {"scope": "document"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407113+00:00"}}
28
+
29
+ data: {"type": "step", "data": {"step_number": 5, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "extract.urls() \u2192 1 links", "reward": 0.05, "extracted_data": {"tool_name": "extract.urls", "result": {"count": 1, "sample": ["https://iana.org/domains/example"]}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407242+00:00"}}
30
+
31
+ data: {"type": "step", "data": {"step_number": 6, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "extract.emails(html)", "reward": 0.0, "extracted_data": {"tool_name": "extract.emails", "tool_description": "Extract email addresses from page content", "parameters": {"pattern": "email regex"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407309+00:00"}}
32
+
33
+ data: {"type": "step", "data": {"step_number": 6, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "extract.emails() \u2192 0 emails", "reward": 0.05, "extracted_data": {"tool_name": "extract.emails", "result": {"count": 0, "sample": []}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407375+00:00"}}
34
+
35
+ data: {"type": "step", "data": {"step_number": 7, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "html.extract(fields=['title','content','links'])", "reward": 0.0, "extracted_data": {"tool_name": "html.extract", "tool_description": "Extract key structural fields for downstream processing", "parameters": {"fields": ["title", "content", "links"]}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407423+00:00"}}
36
+
37
+ data: {"type": "step", "data": {"step_number": 7, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "html.extract() \u2192 fields ready", "reward": 0.05, "extracted_data": {"tool_name": "html.extract", "result": {"title_length": 14, "content_length": 142, "link_count": 1}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.407525+00:00"}}
38
+
39
+ data: {"type": "step", "data": {"step_number": 8, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "llm.generate_extraction_code() \u2192 533 chars", "reward": 0.2, "extracted_data": {"tool_name": "llm.generate_extraction_code", "tool_description": "Generate extraction code from page context and requested output schema", "parameters": {"html_sample_length": 528, "instructions": "Extract the heading and paragraph", "output_format": "json"}, "result": {"code_length": 533, "mode": "llm"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.894135+00:00"}}
40
+
41
+ data: {"type": "step", "data": {"step_number": 9, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "sandbox.execute(code=llm_generated_code)", "reward": 0.0, "extracted_data": {"tool_name": "sandbox.execute", "tool_description": "Execute LLM-generated extraction code in sandboxed Python environment", "parameters": {"code_length": 533, "timeout": 30}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:14.894332+00:00"}}
42
+
43
+ data: {"type": "step", "data": {"step_number": 10, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "agent.recover_relevance(query)", "reward": 0.0, "extracted_data": {"tool_name": "agent.recover_relevance", "tool_description": "Search-guided relevance recovery for empty extraction output", "parameters": {"keywords": ["heading", "paragraph"], "baseline_relevance": 0.0}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:15.509034+00:00"}}
44
+
45
+ data: {"type": "step", "data": {"step_number": 10, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "agent.recover_relevance() \u2192 no_change (0.00)", "reward": 0.0, "extracted_data": {"tool_name": "agent.recover_relevance", "result": {"improved": false, "relevance": 0.0, "recovered_rows": 0, "source": null}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.295843+00:00"}}
46
+
47
+ data: {"type": "step", "data": {"step_number": 10, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "sandbox.execute() \u2192 Extracted 1 items", "reward": 0.1, "extracted_data": {"tool_name": "sandbox.execute", "tool_description": "Execute extraction code in sandbox", "result": {"items_extracted": 1, "has_signal": false, "relevance_score": 0.0, "mode": "llm", "columns": ["json_with_heading", "paragraph_fields"], "sample": [{"json_with_heading": "", "paragraph_fields": ""}]}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296085+00:00"}}
48
+
49
+ data: {"type": "step", "data": {"step_number": 11, "action": "tool_call", "url": "https://example.com", "status": "running", "message": "json.dumps(data=extracted_items)", "reward": 0.0, "extracted_data": {"tool_name": "json.dumps", "tool_description": "Format extracted data as JSON", "parameters": {"item_count": 1}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296187+00:00"}}
50
+
51
+ data: {"type": "step", "data": {"step_number": 11, "action": "tool_call", "url": "https://example.com", "status": "complete", "message": "json.dumps() \u2192 Output ready", "reward": 0.1, "extracted_data": {"tool_name": "json.dumps", "tool_description": "Format extracted data as JSON", "result": {"format": "json", "size": 1}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296247+00:00"}}
52
+
53
+ data: {"type": "step", "data": {"step_number": 12, "action": "complete", "url": "https://example.com", "status": "complete", "message": "Agentic scraping complete: 1 items extracted", "reward": 1.5000000000000002, "extracted_data": {"item_count": 1}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296297+00:00"}}
54
+
55
+ data: {"type": "url_complete", "url": "https://example.com", "index": 0}
56
+
57
+ data: {"type": "step", "data": {"step_number": 26, "action": "tool_call", "url": null, "status": "running", "message": "json.dumps(data, format='json')", "reward": 0.0, "extracted_data": {"tool_name": "json.dumps", "tool_description": "Format extracted data as JSON", "parameters": {"output_format": "json", "data_keys": ["https://example.com"]}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296408+00:00"}}
58
+
59
+ data: {"type": "step", "data": {"step_number": 26, "action": "tool_call", "url": null, "status": "completed", "message": "json.dumps() \u2192 106 bytes", "reward": 0.05, "extracted_data": {"tool_name": "json.dumps", "result": {"output_length": 106, "format": "json"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.296473+00:00"}}
60
+
61
+ data: {"type": "step", "data": {"step_number": 28, "action": "tool_call", "url": null, "status": "running", "message": "memory.store(key='summary', type='LONG_TERM')", "reward": 0.0, "extracted_data": {"tool_name": "memory.store", "tool_description": "Store scrape summary in long-term memory", "parameters": {"key": "scrape:26d3ce6a-99dd-4354-80de-4b7b014af1ee:summary", "memory_type": "LONG_TERM", "output_length": 106}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.299964+00:00"}}
62
+
63
+ data: {"type": "step", "data": {"step_number": 28, "action": "tool_call", "url": null, "status": "completed", "message": "memory.store() \u2192 stored", "reward": 0.05, "extracted_data": {"tool_name": "memory.store", "result": {"stored": true, "key": "scrape:26d3ce6a-99dd-4354-80de-4b7b014af1ee:summary"}}, "duration_ms": null, "timestamp": "2026-04-08T03:48:52.300622+00:00"}}
64
+
65
+ data: {"type": "complete", "data": {"session_id": "26d3ce6a-99dd-4354-80de-4b7b014af1ee", "status": "completed", "total_steps": 29, "total_reward": 0.0, "extracted_data": {"https://example.com": [{"json_with_heading": "", "paragraph_fields": ""}]}, "output": "{\n \"https://example.com\": [\n {\n \"json_with_heading\": \"\",\n \"paragraph_fields\": \"\"\n }\n ]\n}", "output_format": "json", "duration_seconds": 38.58329486846924, "urls_processed": 1, "errors": [], "enabled_plugins": [], "requested_plugins": [], "selected_agents": [], "memory_enabled": true, "sandbox_artifacts": ["final_extracted_data.json", "final_output.json", "memory_request.json", "memory_summary.txt"]}}
66
+