Final_Assignment_Template

Sleeping

App Files Files Community

Vinsmart06 commited on Mar 15

Commit

a434b1c

verified ·

1 Parent(s): 2f7b838

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -28

app.py CHANGED Viewed

@@ -129,22 +129,38 @@ class BasicAgent:
             return f"Search error: {e}"
     # ── TOOL: Scrape web page ─────────────────────────────────────────
-    def scrape_page(self, url):
         url = url.strip(' "')
         if "youtube.com" in url or "youtu.be" in url:
             return "YouTube cannot be scraped directly."
         try:
             headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0 Safari/537.36"}
             resp = requests.get(url, timeout=15, headers=headers)
-            print(f"    [scrape] status={resp.status_code}, content_len={len(resp.text)} for {url[:80]}")
             soup = BeautifulSoup(resp.text, "html.parser")
             for tag in soup(["script", "style", "nav", "footer", "header"]):
                 tag.decompose()
-            text = soup.get_text(separator=" ", strip=True)
-            print(f"    [scrape] extracted text len={len(text)}, preview: {text[:100]}")
-            if len(text) < 100:
                 return f"Page returned too little content (status {resp.status_code})"
-            return f"PAGE: {text[:4000]}"
         except Exception as e:
             return f"Scrape error: {e}"
@@ -254,7 +270,8 @@ class BasicAgent:
         if tool == "wiki_search":
             return self.wiki_search(input_data)
         elif tool == "scrape_page":
-            return self.scrape_page(target)
         elif tool == "read_audio":
             return self.read_audio(target)
         elif tool == "read_excel":
@@ -288,7 +305,7 @@ class BasicAgent:
                     pass
         memory = pre_context
         system_prompt = """You are a precise GAIA benchmark solver.
     STRICT OUTPUT FORMAT - choose exactly one:
@@ -317,19 +334,24 @@ class BasicAgent:
       https://en.wikipedia.org/wiki/1928_Summer_Olympics
     - Malko Competition:
       https://en.wikipedia.org/wiki/Malko_Competition
-    - Wikipedia Featured articles November 2016:
-      https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_that_were_promoted_in_November_2016
     - 1977 New York Yankees season stats:
       https://en.wikipedia.org/wiki/1977_New_York_Yankees_season
     - Taishō Tamai (baseball):
       https://en.wikipedia.org/wiki/Taish%C5%8D_Tamai
-    - Kochanie, mam problem (Polish Everybody Loves Raymond):
-      https://en.wikipedia.org/wiki/Kochanie,_mam_problem
-    - Universe Today Carolyn Collins Petersen June 2023:
-      https://www.universetoday.com/161812/
     FACTS YOU KNOW (no tools needed):
     - Reversed text questions: decode then answer directly as FINAL
     - Basic math/logic: reason step by step then answer as FINAL
     - Botanical vegetables: only plant parts with NO seeds inside count as vegetables.
       Exclude: tomato, pepper, corn, zucchini, green beans, peas, cucumber, squash, acorns, peanuts.
@@ -376,8 +398,14 @@ class BasicAgent:
                 elif t_match.group(2):
                     tool_input = t_match.group(2).strip()
                 else:
-                    tool_input = ""
                 # ── CALL THE TOOL AND UPDATE MEMORY ──
                 result = self.execute_tool(tool_name, tool_input, file_url)
                 print(f"  [{tool_name}] → {result[:100]}")
@@ -472,20 +500,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             file_name = item.get("file_name", "")
             task_id = item.get("task_id", "")
             if file_name:
-                # Try the direct file_name URL first
                 file_url = f"https://agents-course-unit4-scoring.hf.space/files/{file_name}"
-                # Verify it exists
                 try:
-                    test = requests.head(file_url, timeout=5)
-                    if test.status_code == 404:
-                        # Try with task_id prefix
-                        file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}/{file_name}"
-                        test2 = requests.head(file_url, timeout=5)
-                        if test2.status_code == 404:
-                            file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
-                    print(f"  [FILE] name='{file_name}', verified_url={file_url} status={test.status_code}")
-                except:
-                    pass
             else:
                 file_url = None
             print(f"  [FILE] name={file_name!r}, url={file_url}")

             return f"Search error: {e}"
     # ── TOOL: Scrape web page ─────────────────────────────────────────
+    def scrape_page(self, url, search_terms=None):
         url = url.strip(' "')
         if "youtube.com" in url or "youtu.be" in url:
             return "YouTube cannot be scraped directly."
         try:
             headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0 Safari/537.36"}
             resp = requests.get(url, timeout=15, headers=headers)
             soup = BeautifulSoup(resp.text, "html.parser")
             for tag in soup(["script", "style", "nav", "footer", "header"]):
                 tag.decompose()
+            full_text = soup.get_text(separator=" ", strip=True)
+            if len(full_text) < 100:
                 return f"Page returned too little content (status {resp.status_code})"
+            # If search terms provided, find the most relevant 4000-char window
+            if search_terms:
+                terms = search_terms.lower().split()
+                best_pos = 0
+                best_score = 0
+                # Slide a window and find the chunk with most term matches
+                window = 3000
+                for pos in range(0, len(full_text) - window, 500):
+                    chunk = full_text[pos:pos+window].lower()
+                    score = sum(chunk.count(t) for t in terms)
+                    if score > best_score:
+                        best_score = score
+                        best_pos = pos
+                relevant = full_text[max(0, best_pos-200):best_pos+window]
+                return f"PAGE (relevant section): {relevant}"
+            # Default: return first 8000 chars
+            return f"PAGE: {full_text[:8000]}"
         except Exception as e:
             return f"Scrape error: {e}"
         if tool == "wiki_search":
             return self.wiki_search(input_data)
         elif tool == "scrape_page":
+        # Extract key terms from tool_input or use question words
+            return self.scrape_page(target, search_terms=input_data)
         elif tool == "read_audio":
             return self.read_audio(target)
         elif tool == "read_excel":
                     pass
         memory = pre_context
+        seen_tool_calls = set()
         system_prompt = """You are a precise GAIA benchmark solver.
     STRICT OUTPUT FORMAT - choose exactly one:
       https://en.wikipedia.org/wiki/1928_Summer_Olympics
     - Malko Competition:
       https://en.wikipedia.org/wiki/Malko_Competition
     - 1977 New York Yankees season stats:
       https://en.wikipedia.org/wiki/1977_New_York_Yankees_season
     - Taishō Tamai (baseball):
       https://en.wikipedia.org/wiki/Taish%C5%8D_Tamai
+        - Wikipedia Featured articles November 2016:
+          https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log/November_2016
+        - Universe Today Carolyn Collins Petersen June 2023:
+          https://web.archive.org/web/2023/https://www.universetoday.com/161812/
+        - Polish Everybody Loves Raymond (Świat według Kiepskich):
+          https://en.wikipedia.org/wiki/Wszyscy_kochaj%C4%85_Raymonda_(Polish_TV_series)
+        - Mercedes Sosa discography (use main article not redirect):
+          https://en.wikipedia.org/wiki/Mercedes_Sosa
     FACTS YOU KNOW (no tools needed):
     - Reversed text questions: decode then answer directly as FINAL
+    - When asked for "first name only", return ONLY the first word of the name
+    - When asked for "surname only", return ONLY the last word
     - Basic math/logic: reason step by step then answer as FINAL
     - Botanical vegetables: only plant parts with NO seeds inside count as vegetables.
       Exclude: tomato, pepper, corn, zucchini, green beans, peas, cucumber, squash, acorns, peanuts.
                 elif t_match.group(2):
                     tool_input = t_match.group(2).strip()
                 else:
+                        tool_input = ""
+                    # ── SKIP duplicate tool calls ──
+                call_key = f"{tool_name}:{tool_input[:80]}"
+                if call_key in seen_tool_calls:
+                    memory += f"\n\n[Step {step} - DUPLICATE SKIPPED: {call_key}. You already tried this. Use a DIFFERENT URL or approach.]"
+                    print(f"  [DUPLICATE SKIPPED] {call_key}")
+                    continue
+                seen_tool_calls.add(call_key)
                 # ── CALL THE TOOL AND UPDATE MEMORY ──
                 result = self.execute_tool(tool_name, tool_input, file_url)
                 print(f"  [{tool_name}] → {result[:100]}")
             file_name = item.get("file_name", "")
             task_id = item.get("task_id", "")
             if file_name:
                 file_url = f"https://agents-course-unit4-scoring.hf.space/files/{file_name}"
                 try:
+                    # Use GET with stream instead of HEAD (HEAD returns 405)
+                    test = requests.get(file_url, timeout=5, stream=True)
+                    test.close()
+                    print(f"  [FILE] name='{file_name}', url={file_url}, status={test.status_code}")
+                except Exception as e:
+                    print(f"  [FILE] verification error: {e}")
             else:
                 file_url = None
             print(f"  [FILE] name={file_name!r}, url={file_url}")