Spaces:

ZZZyx3587
/

ResearchRadar

Running

App Files Files Community

ZZZyx3587 committed 3 days ago

Commit

e2b4dbb

verified ·

1 Parent(s): 8b70b45

Upload repo_searcher.py with huggingface_hub

Browse files

Files changed (1) hide show

repo_searcher.py +77 -8

repo_searcher.py CHANGED Viewed

@@ -1,9 +1,26 @@
 import requests
 import os
 def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
-    """Search GitHub for repositories
     Args:
         queries: list of search query strings
@@ -11,38 +28,80 @@ def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
     Returns:
         list[dict]: candidate repositories, deduplicated by full_name and sorted by stars in descending order.
-        Each dict contains: full_name, html_url, description, stars,
-        language, updated_at, topics, match_keyword
     """
     token = os.getenv("GITHUB_TOKEN", "")
     headers = {"Accept": "application/vnd.github.v3+json"}
     if token:
         headers["Authorization"] = f"Bearer {token}"
     seen = set()
     results = []
-    for kw in queries:
         query = f"{kw} language:python stars:>=5"
         url = "https://api.github.com/search/repositories"
         params = {"q": query, "sort": "stars", "order": "desc", "per_page": max_per_keyword}
         try:
             resp = requests.get(url, headers=headers, params=params, timeout=15)
             if resp.status_code == 403 and "rate limit" in resp.text.lower():
-                print(f"[警告] GitHub 搜索限速，跳过关键词: {kw}")
                 continue
             resp.raise_for_status()
             data = resp.json()
         except Exception as e:
-            print(f"[警告] 搜索关键词 '{kw}' 失败: {e}")
             continue
         for item in data.get("items", []):
             full_name = item["full_name"]
             if full_name not in seen:
                 seen.add(full_name)
-                results.append({
                     "full_name": full_name,
                     "html_url": item["html_url"],
                     "description": item.get("description", ""),
@@ -51,7 +110,17 @@ def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
                     "updated_at": item.get("updated_at", ""),
                     "topics": item.get("topics", []),
                     "match_keyword": kw,
-                })
     results.sort(key=lambda r: r["stars"], reverse=True)
     return results

 import requests
 import os
+import time
+# GitHub API rate-limit parameters
+RATE_LIMIT_DELAY = 7.0        # delay (seconds) between search terms when unauthenticated, to stay within 10 req/min
+RATE_LIMIT_DELAY_AUTH = 2.5   # delay (seconds) between search terms when authenticated, to stay within 30 req/min
+def _check_rate_limit(headers: dict) -> int:
+    """Return the remaining GitHub search-API quota, or -1 if it cannot be read.
+
+    Args:
+        headers: HTTP headers sent with the request to the /rate_limit endpoint.
+
+    Returns:
+        The value of resources.search.remaining from the JSON response, or -1
+        when the request raises, the status code is not 200, or the field is
+        missing.
+    """
+    try:
+        resp = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
+        if resp.status_code == 200:
+            data = resp.json()
+            return data.get("resources", {}).get("search", {}).get("remaining", -1)
+    except Exception:
+        # Best-effort probe: every error is swallowed and reported as -1 below.
+        pass
+    return -1
 def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
+    """Search GitHub repositories (rate-limit aware: 10 req/min unauthenticated, 30 req/min authenticated)
     Args:
         queries: list of search query strings
     Returns:
         list[dict]: candidate repositories, deduplicated by full_name and sorted by stars in descending order.
     """
     token = os.getenv("GITHUB_TOKEN", "")
     headers = {"Accept": "application/vnd.github.v3+json"}
     if token:
         headers["Authorization"] = f"Bearer {token}"
+        delay = RATE_LIMIT_DELAY_AUTH
+    else:
+        delay = RATE_LIMIT_DELAY
+    # When unauthenticated, cap the number of search terms so they are not all skipped by rate limiting
+    max_queries = len(queries) if token else min(len(queries), 8)
+    queries = queries[:max_queries]
     seen = set()
     results = []
+    skipped = 0
+    from cache import github_search_cache
+    cache_hits = 0
+    for i, kw in enumerate(queries):
+        # Check the cache first
+        cache_key = f"{kw}:{max_per_keyword}"
+        cached = github_search_cache.get(cache_key)
+        if cached is not None:
+            cache_hits += 1
+            for item in cached:
+                full_name = item["full_name"]
+                if full_name not in seen:
+                    seen.add(full_name)
+                    results.append(item)
+            continue
+        # Rate-limit aware: add a delay between requests
+        if i > 0:
+            time.sleep(delay)
         query = f"{kw} language:python stars:>=5"
         url = "https://api.github.com/search/repositories"
         params = {"q": query, "sort": "stars", "order": "desc", "per_page": max_per_keyword}
         try:
             resp = requests.get(url, headers=headers, params=params, timeout=15)
+            # Inspect the rate-limit headers: when no quota remains, wait until the reset time
+            # (plus a 2 s margin) and retry once, or stop entirely if the wait would be 90 s or more
+            remaining = int(resp.headers.get("X-RateLimit-Remaining", -1))
+            if remaining == 0:
+                reset_time = int(resp.headers.get("X-RateLimit-Reset", 0))
+                wait = max(reset_time - time.time(), 0) + 2
+                print(f"[限速] GitHub 搜索额度用尽，等待 {wait:.0f}s...")
+                if wait < 90:
+                    time.sleep(wait)
+                    resp = requests.get(url, headers=headers, params=params, timeout=15)
+                else:
+                    print(f"[警告] 等待时间过长，跳过剩余 {len(queries) - i} 个搜索词")
+                    break
             if resp.status_code == 403 and "rate limit" in resp.text.lower():
+                skipped += 1
+                if skipped <= 2:
+                    print(f"[警告] GitHub 搜索限速，跳过: {kw[:40]}...")
                 continue
             resp.raise_for_status()
             data = resp.json()
         except Exception as e:
+            print(f"[警告] 搜索 '{kw[:40]}...' 失败: {e}")
             continue
+        kw_results = []
         for item in data.get("items", []):
             full_name = item["full_name"]
             if full_name not in seen:
                 seen.add(full_name)
+                # NOTE(review): no "stars" key is visible in the dict below, yet results are sorted by
+                # r["stars"] at the end; this diff view presumably hides unchanged lines here — confirm
+                # against the full file
+                repo = {
                     "full_name": full_name,
                     "html_url": item["html_url"],
                     "description": item.get("description", ""),
                     "updated_at": item.get("updated_at", ""),
                     "topics": item.get("topics", []),
                     "match_keyword": kw,
+                }
+                kw_results.append(repo)
+                results.append(repo)
+        # Cache the results for this search term (only repos not already seen in this call; empty lists are not cached)
+        if kw_results:
+            github_search_cache.set(cache_key, kw_results)
+    if cache_hits:
+        print(f"  [缓存] 命中 {cache_hits}/{len(queries)} 个搜索词")
+    if skipped:
+        print(f"[限速] 共跳过 {skipped} 个搜索词（GitHub API 额度不足）")
     results.sort(key=lambda r: r["stars"], reverse=True)
     return results