Spaces:

ZZZyx3587
/

ResearchRadar

Running

App Files Community

ZZZyx3587 committed 22 days ago

Commit

151e05c

verified ·

1 Parent(s): 4c14732

Upload run.py with huggingface_hub

Browse files

Files changed (1) hide show

run.py +99 -1

run.py CHANGED Viewed

@@ -259,6 +259,83 @@ def _search_s2_papers(query: str, limit: int = 5) -> list[dict]:
     return papers
 def run(arxiv_url: str, top_n: int = 5, progress=None) -> dict:
     """主入口：先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
@@ -294,6 +371,20 @@ def run(arxiv_url: str, top_n: int = 5, progress=None) -> dict:
     print(f"  分类: {', '.join(paper.get('categories', []))}")
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
     # [1.5] 对比实验算法挖掘（从 S2 引用网络提取算法名）
     # ================================================================
@@ -443,9 +534,16 @@ def run(arxiv_url: str, top_n: int = 5, progress=None) -> dict:
     classified.sort(key=lambda r: r.get("stars", 0), reverse=True)
     unclassified.sort(key=lambda r: r.get("stars", 0), reverse=True)
-    candidates = (classified + unclassified)[:top_n]
     print(f"  有方法族归属: {len(classified)} 个，未归类: {len(unclassified)} 个")
     print(f"  最终选取 {len(candidates)} 个仓库:")
     for i, c in enumerate(candidates):
         family = repo_family_map.get(c["full_name"], "未归类")

     return papers
+def _find_input_paper_repo(title: str, arxiv_id: str, authors: list[str]) -> dict | None:
+    """Search GitHub for the official code repository of the input paper.
+
+    Queries are tried in priority order and the first acceptable hit wins:
+
+    1. the arXiv ID, searched in repository name, description and README;
+    2. the core title keywords as a quoted phrase (repositories with >= 3 stars);
+    3. the first author's last name combined with the core title keywords.
+
+    A search result is accepted only when at least two distinct core title
+    keywords occur (case-insensitively, as substrings) in the repository's
+    full name or description.
+
+    Args:
+        title: Paper title. Its first six words that are not stopwords and
+            are at least three characters long become the core keywords.
+        arxiv_id: arXiv identifier of the paper; the ID query is skipped
+            when this is empty.
+        authors: Author names, first author first; may be empty.
+
+    Returns:
+        A candidate-repository dict with the keys ``full_name``,
+        ``html_url``, ``description``, ``stars``, ``language``,
+        ``updated_at``, ``topics`` and ``match_keyword``, or ``None`` when no
+        acceptable repository is found.
+    """
+    # Stopwords plus generic paper vocabulary that carries no identifying signal.
+    stops = {
+        'a', 'an', 'the', 'of', 'for', 'in', 'on', 'to', 'and', 'or', 'is', 'are',
+        'we', 'our', 'that', 'this', 'with', 'from', 'by', 'as', 'at', 'be', 'it',
+        'its', 'not', 'can', 'has', 'have', 'been', 'was', 'were', 'will', 'would',
+        'could', 'should', 'may', 'do', 'does', 'did', 'so', 'if', 'no', 'new',
+        'based', 'using', 'which', 'into', 'such', 'than', 'then', 'these', 'those',
+        'propose', 'present', 'method', 'approach', 'framework', 'novel', 'model',
+        'learning', 'deep', 'via', 'et', 'al', 'towards', 'toward',
+    }
+    # Core keywords: the first 6 meaningful words of the title.
+    title_words = [w for w in re.sub(r'[^\w\s-]', ' ', title).split()
+                   if w.lower() not in stops and len(w) >= 3]
+    keywords = title_words[:6]
+    core_keywords = " ".join(keywords)
+    # De-duplicate (case-insensitively, order preserved) so that a word that
+    # appears twice in the title cannot satisfy the two-keyword threshold alone.
+    distinct_keywords = list(dict.fromkeys(kw.lower() for kw in keywords))
+    if len(distinct_keywords) < 2:
+        # The acceptance filter needs two keyword hits, so no query could ever
+        # succeed; return early instead of spending GitHub API quota.
+        return None
+    # Imported locally (as the original code did) and only once a search is
+    # actually going to be issued.
+    import requests
+    token = os.getenv("GITHUB_TOKEN", "")
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    search_queries = []
+    if arxiv_id:
+        # An empty ID would produce the meaningless query '""'.
+        search_queries.append(f'"{arxiv_id}" in:name,description,readme')
+    search_queries.append(f'"{core_keywords}" in:name,description stars:>=3')
+    # First author's last name + keywords.
+    name_parts = authors[0].split() if authors and authors[0] else []
+    last_name = name_parts[-1] if name_parts else ""
+    if len(last_name) >= 3:
+        search_queries.append(f"{last_name} {core_keywords[:80]} in:name,description")
+    url = "https://api.github.com/search/repositories"
+    for query in search_queries:
+        params = {"q": query, "sort": "stars", "order": "desc", "per_page": 5}
+        try:
+            resp = requests.get(url, headers=headers, params=params, timeout=15)
+            if resp.status_code in (403, 429):
+                # Rate-limited or forbidden: best effort, try the next query.
+                continue
+            resp.raise_for_status()
+            data = resp.json()
+        except (requests.RequestException, ValueError):
+            # Network/HTTP failure or a non-JSON body: skip this query only.
+            continue
+        # Filter: name + description must contain at least 2 core keywords.
+        for item in data.get("items", []):
+            # GitHub returns null for a missing description/language, so
+            # ``.get(key, default)`` is not enough to guarantee a string.
+            description = item.get("description") or ""
+            repo_name = item.get("full_name", "").lower()
+            combined = f"{repo_name} {description.lower()}"
+            matches = sum(1 for kw in distinct_keywords if kw in combined)
+            if matches >= 2:
+                return {
+                    "full_name": item["full_name"],
+                    "html_url": item["html_url"],
+                    "description": description,
+                    "stars": item.get("stargazers_count", 0),
+                    "language": item.get("language") or "",
+                    "updated_at": item.get("updated_at", ""),
+                    "topics": item.get("topics") or [],
+                    "match_keyword": f"本文代码: {query[:60]}",
+                }
+    return None
 def run(arxiv_url: str, top_n: int = 5, progress=None) -> dict:
     """主入口：先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
     print(f"  分类: {', '.join(paper.get('categories', []))}")
     print(f"  耗时: {time.time() - t0:.1f}s")
+    # ================================================================
+    # [1.2] 搜索输入论文的官方代码
+    # ================================================================
+    print()
+    print("[1.2] 正在搜索输入论文的官方代码...")
+    _prog(0.10, "正在搜索论文自身的官方代码...")
+    t0 = time.time()
+    input_paper_repo = _find_input_paper_repo(title, arxiv_id, paper.get("authors", []))
+    if input_paper_repo:
+        print(f"  找到论文官方代码: {input_paper_repo['full_name']} (Stars: {input_paper_repo.get('stars', 0)})")
+    else:
+        print(f"  未找到论文官方代码")
+    print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
     # [1.5] 对比实验算法挖掘（从 S2 引用网络提取算法名）
     # ================================================================
     classified.sort(key=lambda r: r.get("stars", 0), reverse=True)
     unclassified.sort(key=lambda r: r.get("stars", 0), reverse=True)
+    # 论文自身代码优先排在最前面
+    if input_paper_repo:
+        candidates = [input_paper_repo] + (classified + unclassified)[:top_n - 1]
+        repo_family_map[input_paper_repo["full_name"]] = "本文代码"
+    else:
+        candidates = (classified + unclassified)[:top_n]
     print(f"  有方法族归属: {len(classified)} 个，未归类: {len(unclassified)} 个")
+    if input_paper_repo:
+        print(f"  含论文自身代码: {input_paper_repo['full_name']}")
     print(f"  最终选取 {len(candidates)} 个仓库:")
     for i, c in enumerate(candidates):
         family = repo_family_map.get(c["full_name"], "未归类")