ZZZyx3587 committed on
Commit
e2b4dbb
·
verified ·
1 Parent(s): 8b70b45

Upload repo_searcher.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. repo_searcher.py +77 -8
repo_searcher.py CHANGED
@@ -1,9 +1,26 @@
1
  import requests
2
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
6
- """在 GitHub 搜索仓库
7
 
8
  Args:
9
  queries: 搜索查询字符串列表
@@ -11,38 +28,80 @@ def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
11
 
12
  Returns:
13
  list[dict]: 已去重(按 full_name)、按 stars 降序排列的候选仓库列表。
14
- 每个 dict 包含: full_name, html_url, description, stars,
15
- language, updated_at, topics, match_keyword
16
  """
17
  token = os.getenv("GITHUB_TOKEN", "")
18
  headers = {"Accept": "application/vnd.github.v3+json"}
19
  if token:
20
  headers["Authorization"] = f"Bearer {token}"
 
 
 
 
 
 
 
21
 
22
  seen = set()
23
  results = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- for kw in queries:
26
  query = f"{kw} language:python stars:>=5"
27
  url = "https://api.github.com/search/repositories"
28
  params = {"q": query, "sort": "stars", "order": "desc", "per_page": max_per_keyword}
29
 
30
  try:
31
  resp = requests.get(url, headers=headers, params=params, timeout=15)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  if resp.status_code == 403 and "rate limit" in resp.text.lower():
33
- print(f"[警告] GitHub 搜索限速,跳过关键词: {kw}")
 
 
34
  continue
35
  resp.raise_for_status()
36
  data = resp.json()
37
  except Exception as e:
38
- print(f"[警告] 搜索关键词 '{kw}' 失败: {e}")
39
  continue
40
 
 
41
  for item in data.get("items", []):
42
  full_name = item["full_name"]
43
  if full_name not in seen:
44
  seen.add(full_name)
45
- results.append({
46
  "full_name": full_name,
47
  "html_url": item["html_url"],
48
  "description": item.get("description", ""),
@@ -51,7 +110,17 @@ def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
51
  "updated_at": item.get("updated_at", ""),
52
  "topics": item.get("topics", []),
53
  "match_keyword": kw,
54
- })
 
 
 
 
 
 
 
 
 
 
55
 
56
  results.sort(key=lambda r: r["stars"], reverse=True)
57
  return results
 
1
  import requests
2
  import os
3
+ import time
4
+
5
+ # GitHub API 限速参数
6
+ RATE_LIMIT_DELAY = 7.0 # 未认证时每个搜索词之间的间隔(秒),确保 10 req/min
7
+ RATE_LIMIT_DELAY_AUTH = 2.5 # 已认证时每个搜索词之间的间隔(秒),确保 30 req/min
8
+
9
+
10
def _check_rate_limit(headers: dict) -> int:
    """Return the remaining GitHub Search API quota, or -1 if unknown.

    Sends ``GET https://api.github.com/rate_limit`` with the given headers and
    reads ``resources.search.remaining`` from the JSON response.

    Args:
        headers: HTTP headers to send with the request (the caller's
            Accept / Authorization headers).

    Returns:
        The remaining number of search requests as an ``int``. Returns -1 when
        the request fails, the status code is not 200, or the payload does not
        contain a usable integer at ``resources.search.remaining``.
    """
    try:
        resp = requests.get(
            "https://api.github.com/rate_limit", headers=headers, timeout=10
        )
        if resp.status_code != 200:
            return -1
        # Index strictly and coerce to int so the declared return type always
        # holds; a missing key, a null value or a non-dict payload falls
        # through to the handler below instead of leaking None or a str.
        return int(resp.json()["resources"]["search"]["remaining"])
    except (requests.RequestException, LookupError, TypeError, ValueError):
        # Best-effort probe: network errors, undecodable JSON and malformed
        # payloads all mean "unknown", which callers see as -1.
        return -1
20
 
21
 
22
  def search_repos(queries: list[str], max_per_keyword: int = 5) -> list[dict]:
23
+ """在 GitHub 搜索仓库(限速感知:未认证 10 次/分,已认证 30 次/分)
24
 
25
  Args:
26
  queries: 搜索查询字符串列表
 
28
 
29
  Returns:
30
  list[dict]: 已去重(按 full_name)、按 stars 降序排列的候选仓库列表。
 
 
31
  """
32
  token = os.getenv("GITHUB_TOKEN", "")
33
  headers = {"Accept": "application/vnd.github.v3+json"}
34
  if token:
35
  headers["Authorization"] = f"Bearer {token}"
36
+ delay = RATE_LIMIT_DELAY_AUTH
37
+ else:
38
+ delay = RATE_LIMIT_DELAY
39
+
40
+ # 未认证时限制搜索词数量,避免全部被限速跳过
41
+ max_queries = len(queries) if token else min(len(queries), 8)
42
+ queries = queries[:max_queries]
43
 
44
  seen = set()
45
  results = []
46
+ skipped = 0
47
+
48
+ from cache import github_search_cache
49
+ cache_hits = 0
50
+
51
+ for i, kw in enumerate(queries):
52
+ # 先查缓存
53
+ cache_key = f"{kw}:{max_per_keyword}"
54
+ cached = github_search_cache.get(cache_key)
55
+ if cached is not None:
56
+ cache_hits += 1
57
+ for item in cached:
58
+ full_name = item["full_name"]
59
+ if full_name not in seen:
60
+ seen.add(full_name)
61
+ results.append(item)
62
+ continue
63
+
64
+ # 限速感知:在请求之间加入延迟
65
+ if i > 0:
66
+ time.sleep(delay)
67
 
 
68
  query = f"{kw} language:python stars:>=5"
69
  url = "https://api.github.com/search/repositories"
70
  params = {"q": query, "sort": "stars", "order": "desc", "per_page": max_per_keyword}
71
 
72
  try:
73
  resp = requests.get(url, headers=headers, params=params, timeout=15)
74
+
75
+ # 检查限速头
76
+ remaining = int(resp.headers.get("X-RateLimit-Remaining", -1))
77
+ if remaining == 0:
78
+ reset_time = int(resp.headers.get("X-RateLimit-Reset", 0))
79
+ wait = max(reset_time - time.time(), 0) + 2
80
+ print(f"[限速] GitHub 搜索额度用尽,等待 {wait:.0f}s...")
81
+ if wait < 90:
82
+ time.sleep(wait)
83
+ resp = requests.get(url, headers=headers, params=params, timeout=15)
84
+ else:
85
+ print(f"[警告] 等待时间过长,跳过剩余 {len(queries) - i} 个搜索词")
86
+ break
87
+
88
  if resp.status_code == 403 and "rate limit" in resp.text.lower():
89
+ skipped += 1
90
+ if skipped <= 2:
91
+ print(f"[警告] GitHub 搜索限速,跳过: {kw[:40]}...")
92
  continue
93
  resp.raise_for_status()
94
  data = resp.json()
95
  except Exception as e:
96
+ print(f"[警告] 搜索 '{kw[:40]}...' 失败: {e}")
97
  continue
98
 
99
+ kw_results = []
100
  for item in data.get("items", []):
101
  full_name = item["full_name"]
102
  if full_name not in seen:
103
  seen.add(full_name)
104
+ repo = {
105
  "full_name": full_name,
106
  "html_url": item["html_url"],
107
  "description": item.get("description", ""),
 
110
  "updated_at": item.get("updated_at", ""),
111
  "topics": item.get("topics", []),
112
  "match_keyword": kw,
113
+ }
114
+ kw_results.append(repo)
115
+ results.append(repo)
116
+ # 缓存该搜索词的结果
117
+ if kw_results:
118
+ github_search_cache.set(cache_key, kw_results)
119
+
120
+ if cache_hits:
121
+ print(f" [缓存] 命中 {cache_hits}/{len(queries)} 个搜索词")
122
+ if skipped:
123
+ print(f"[限速] 共跳过 {skipped} 个搜索词(GitHub API 额度不足)")
124
 
125
  results.sort(key=lambda r: r["stars"], reverse=True)
126
  return results