Vinsmart06 committed on
Commit
a434b1c
·
verified ·
1 Parent(s): 2f7b838

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -28
app.py CHANGED
@@ -129,22 +129,38 @@ class BasicAgent:
129
  return f"Search error: {e}"
130
 
131
  # ── TOOL: Scrape web page ─────────────────────────────────────────
132
- def scrape_page(self, url):
133
  url = url.strip(' "')
134
  if "youtube.com" in url or "youtu.be" in url:
135
  return "YouTube cannot be scraped directly."
136
  try:
137
  headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0 Safari/537.36"}
138
  resp = requests.get(url, timeout=15, headers=headers)
139
- print(f" [scrape] status={resp.status_code}, content_len={len(resp.text)} for {url[:80]}")
140
  soup = BeautifulSoup(resp.text, "html.parser")
141
  for tag in soup(["script", "style", "nav", "footer", "header"]):
142
  tag.decompose()
143
- text = soup.get_text(separator=" ", strip=True)
144
- print(f" [scrape] extracted text len={len(text)}, preview: {text[:100]}")
145
- if len(text) < 100:
146
  return f"Page returned too little content (status {resp.status_code})"
147
- return f"PAGE: {text[:4000]}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  except Exception as e:
149
  return f"Scrape error: {e}"
150
 
@@ -254,7 +270,8 @@ class BasicAgent:
254
  if tool == "wiki_search":
255
  return self.wiki_search(input_data)
256
  elif tool == "scrape_page":
257
- return self.scrape_page(target)
 
258
  elif tool == "read_audio":
259
  return self.read_audio(target)
260
  elif tool == "read_excel":
@@ -288,7 +305,7 @@ class BasicAgent:
288
  pass
289
 
290
  memory = pre_context
291
-
292
  system_prompt = """You are a precise GAIA benchmark solver.
293
 
294
  STRICT OUTPUT FORMAT - choose exactly one:
@@ -317,19 +334,24 @@ class BasicAgent:
317
  https://en.wikipedia.org/wiki/1928_Summer_Olympics
318
  - Malko Competition:
319
  https://en.wikipedia.org/wiki/Malko_Competition
320
- - Wikipedia Featured articles November 2016:
321
- https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_that_were_promoted_in_November_2016
322
  - 1977 New York Yankees season stats:
323
  https://en.wikipedia.org/wiki/1977_New_York_Yankees_season
324
  - Taishō Tamai (baseball):
325
  https://en.wikipedia.org/wiki/Taish%C5%8D_Tamai
326
- - Kochanie, mam problem (Polish Everybody Loves Raymond):
327
- https://en.wikipedia.org/wiki/Kochanie,_mam_problem
328
- - Universe Today Carolyn Collins Petersen June 2023:
329
- https://www.universetoday.com/161812/
 
 
 
 
 
330
 
331
  FACTS YOU KNOW (no tools needed):
332
  - Reversed text questions: decode then answer directly as FINAL
 
 
333
  - Basic math/logic: reason step by step then answer as FINAL
334
  - Botanical vegetables: only plant parts with NO seeds inside count as vegetables.
335
  Exclude: tomato, pepper, corn, zucchini, green beans, peas, cucumber, squash, acorns, peanuts.
@@ -376,8 +398,14 @@ class BasicAgent:
376
  elif t_match.group(2):
377
  tool_input = t_match.group(2).strip()
378
  else:
379
- tool_input = ""
380
-
 
 
 
 
 
 
381
  # ── CALL THE TOOL AND UPDATE MEMORY ──
382
  result = self.execute_tool(tool_name, tool_input, file_url)
383
  print(f" [{tool_name}] β†’ {result[:100]}")
@@ -472,20 +500,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
472
  file_name = item.get("file_name", "")
473
  task_id = item.get("task_id", "")
474
  if file_name:
475
- # Try the direct file_name URL first
476
  file_url = f"https://agents-course-unit4-scoring.hf.space/files/{file_name}"
477
- # Verify it exists
478
  try:
479
- test = requests.head(file_url, timeout=5)
480
- if test.status_code == 404:
481
- # Try with task_id prefix
482
- file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}/{file_name}"
483
- test2 = requests.head(file_url, timeout=5)
484
- if test2.status_code == 404:
485
- file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
486
- print(f" [FILE] name='{file_name}', verified_url={file_url} status={test.status_code}")
487
- except:
488
- pass
489
  else:
490
  file_url = None
491
  print(f" [FILE] name={file_name!r}, url={file_url}")
 
129
  return f"Search error: {e}"
130
 
131
  # ── TOOL: Scrape web page ─────────────────────────────────────────
132
def scrape_page(self, url, search_terms=None):
    """Fetch *url* and return its visible text for the agent loop.

    Parameters
    ----------
    url : str
        Page to fetch; surrounding whitespace and double quotes are stripped.
    search_terms : str | None
        Optional whitespace-separated keywords. When given, a 3000-char
        window slides over the page text in 500-char steps and the chunk
        with the most keyword occurrences (plus 200 chars of leading
        context) is returned instead of the page head.

    Returns
    -------
    str
        "PAGE: ..." / "PAGE (relevant section): ..." on success, or a
        short diagnostic message on failure — this tool never raises.
    """
    url = url.strip(' "')
    # YouTube pages are JS-rendered; a plain HTTP fetch yields nothing useful.
    if "youtube.com" in url or "youtu.be" in url:
        return "YouTube cannot be scraped directly."
    try:
        # Browser-like UA: some sites 403 the default python-requests agent.
        headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0 Safari/537.36"}
        resp = requests.get(url, timeout=15, headers=headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop boilerplate containers before extracting text.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        full_text = soup.get_text(separator=" ", strip=True)
        if len(full_text) < 100:
            return f"Page returned too little content (status {resp.status_code})"

        # If search terms were provided, locate the most relevant window.
        if search_terms:
            terms = search_terms.lower().split()
            # PERF: lowercase the page once instead of once per 3000-char
            # chunk; str.count(sub, start, end) scans the window in place.
            lowered = full_text.lower()
            window = 3000
            best_pos = 0
            best_score = 0
            for pos in range(0, len(lowered) - window, 500):
                score = sum(lowered.count(t, pos, pos + window) for t in terms)
                if score > best_score:
                    best_score = score
                    best_pos = pos
            # FIX: only return a "relevant section" if at least one term
            # actually matched; otherwise fall through to the page head
            # instead of returning an arbitrary first-3000-char slice.
            if best_score > 0:
                relevant = full_text[max(0, best_pos - 200):best_pos + window]
                return f"PAGE (relevant section): {relevant}"

        # Default: return first 8000 chars
        return f"PAGE: {full_text[:8000]}"
    except Exception as e:
        # Best-effort tool: surface the failure as text the agent can read.
        return f"Scrape error: {e}"
166
 
 
270
  if tool == "wiki_search":
271
  return self.wiki_search(input_data)
272
  elif tool == "scrape_page":
273
+ # Extract key terms from tool_input or use question words
274
+ return self.scrape_page(target, search_terms=input_data)
275
  elif tool == "read_audio":
276
  return self.read_audio(target)
277
  elif tool == "read_excel":
 
305
  pass
306
 
307
  memory = pre_context
308
+ seen_tool_calls = set()
309
  system_prompt = """You are a precise GAIA benchmark solver.
310
 
311
  STRICT OUTPUT FORMAT - choose exactly one:
 
334
  https://en.wikipedia.org/wiki/1928_Summer_Olympics
335
  - Malko Competition:
336
  https://en.wikipedia.org/wiki/Malko_Competition
 
 
337
  - 1977 New York Yankees season stats:
338
  https://en.wikipedia.org/wiki/1977_New_York_Yankees_season
339
  - Taishō Tamai (baseball):
340
  https://en.wikipedia.org/wiki/Taish%C5%8D_Tamai
341
+ - Wikipedia Featured articles November 2016:
342
+ https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log/November_2016
343
+ - Universe Today Carolyn Collins Petersen June 2023:
344
+ https://web.archive.org/web/2023/https://www.universetoday.com/161812/
345
+ - Polish Everybody Loves Raymond (Świat według Kiepskich):
346
+ https://en.wikipedia.org/wiki/Wszyscy_kochaj%C4%85_Raymonda_(Polish_TV_series)
347
+ - Mercedes Sosa discography (use main article not redirect):
348
+ https://en.wikipedia.org/wiki/Mercedes_Sosa
349
+
350
 
351
  FACTS YOU KNOW (no tools needed):
352
  - Reversed text questions: decode then answer directly as FINAL
353
+ - When asked for "first name only", return ONLY the first word of the name
354
+ - When asked for "surname only", return ONLY the last word
355
  - Basic math/logic: reason step by step then answer as FINAL
356
  - Botanical vegetables: only plant parts with NO seeds inside count as vegetables.
357
  Exclude: tomato, pepper, corn, zucchini, green beans, peas, cucumber, squash, acorns, peanuts.
 
398
  elif t_match.group(2):
399
  tool_input = t_match.group(2).strip()
400
  else:
401
+ tool_input = ""
402
+ # ── SKIP duplicate tool calls ──
403
+ call_key = f"{tool_name}:{tool_input[:80]}"
404
+ if call_key in seen_tool_calls:
405
+ memory += f"\n\n[Step {step} - DUPLICATE SKIPPED: {call_key}. You already tried this. Use a DIFFERENT URL or approach.]"
406
+ print(f" [DUPLICATE SKIPPED] {call_key}")
407
+ continue
408
+ seen_tool_calls.add(call_key)
409
  # ── CALL THE TOOL AND UPDATE MEMORY ──
410
  result = self.execute_tool(tool_name, tool_input, file_url)
411
  print(f" [{tool_name}] β†’ {result[:100]}")
 
500
  file_name = item.get("file_name", "")
501
  task_id = item.get("task_id", "")
502
  if file_name:
 
503
  file_url = f"https://agents-course-unit4-scoring.hf.space/files/{file_name}"
 
504
  try:
505
+ # Use GET with stream instead of HEAD (HEAD returns 405)
506
+ test = requests.get(file_url, timeout=5, stream=True)
507
+ test.close()
508
+ print(f" [FILE] name='{file_name}', url={file_url}, status={test.status_code}")
509
+ except Exception as e:
510
+ print(f" [FILE] verification error: {e}")
 
 
 
 
511
  else:
512
  file_url = None
513
  print(f" [FILE] name={file_name!r}, url={file_url}")