ZZZyx3587 committed on
Commit
fa671bd
·
verified ·
1 Parent(s): 288ba88

Upload run.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run.py +202 -4
run.py CHANGED
@@ -9,6 +9,10 @@ import sys
9
  import time
10
  import re
11
  import os
 
 
 
 
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
 
14
  # 自动加载 .env 文件
@@ -52,6 +56,140 @@ from repo_evaluator import evaluate_repo
52
  from llm_utils import call_llm_json, parse_json_safe
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def run(arxiv_url: str, top_n: int = 5) -> dict:
56
  """主入口:先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
57
 
@@ -88,6 +226,20 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
88
  print(f" 分类: {', '.join(paper.get('categories', []))}")
89
  print(f" 耗时: {time.time() - t0:.1f}s")
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # ================================================================
92
  # [2/5] 宽泛 GitHub 搜索(甲 Workflow)
93
  # ================================================================
@@ -96,12 +248,14 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
96
  t0 = time.time()
97
 
98
  broad_queries = _extract_broad_queries(title, abstract)
99
- print(f" 生成 {len(broad_queries)} 个宽泛搜索词:")
100
- for q in broad_queries:
 
 
101
  print(f" - {q}")
102
 
103
  try:
104
- broad_results = search_repos(broad_queries, max_per_keyword=5)
105
  except Exception as e:
106
  return {
107
  "paper": paper,
@@ -128,6 +282,19 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
128
  filtered_results = _filter_repos(title, abstract, broad_results)
129
  print(f" 耗时: {time.time() - t0:.1f}s")
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # ================================================================
132
  # [3/5] 基于仓库数据归纳方法族(Agent 1) ← 核心改动
133
  # ================================================================
@@ -136,7 +303,7 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
136
  t0 = time.time()
137
 
138
  try:
139
- direction = analyze_direction(title, abstract, filtered_results)
140
  except Exception as e:
141
  return {
142
  "paper": paper,
@@ -151,6 +318,9 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
151
  for mf in families:
152
  matched = mf.get("matched_repos", [])
153
  print(f" - {mf.get('family_name', '?')}: {len(matched)} 个仓库 {matched}")
 
 
 
154
  print(f" 耗时: {time.time() - t0:.1f}s")
155
 
156
  # ================================================================
@@ -435,6 +605,34 @@ def _filter_repos(title: str, abstract: str, repos: list[dict]) -> list[dict]:
435
  return filtered
436
 
437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  def _make_error_evaluation(error_msg: str) -> dict:
439
  """构造一个表示评估失败的 evaluation dict"""
440
  return {
 
9
  import time
10
  import re
11
  import os
12
+ import json
13
+ import urllib.request
14
+ import urllib.error
15
+ import xml.etree.ElementTree as ET
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
17
 
18
  # 自动加载 .env 文件
 
56
  from llm_utils import call_llm_json, parse_json_safe
57
 
58
 
59
+ def _enrich_domain_context(title: str, abstract: str, categories: list[str]) -> str:
60
+ """从 arxiv 搜索同领域的综述和相关高引论文,扩充领域上下文。
61
+
62
+ 当输入论文不是综述时,普通研究论文的摘要不足以让 Agent 1 推断领域全景。
63
+ 此函数从 arxiv 搜索相关综述/survey 论文,提取摘要作为补充上下文。
64
+ 失败时返回空字符串(静默降级)。
65
+ """
66
+ if not categories:
67
+ return ""
68
+
69
+ primary_cat = categories[0]
70
+ # 用主要分类 + 关键词搜索综述/survey 论文
71
+ keywords = ["survey", "review", "comprehensive", "benchmark"]
72
+ all_abstracts = []
73
+
74
+ for kw in keywords[:2]: # 只搜两个关键词,避免触发 arxiv 限速
75
+ search_query = f"cat:{primary_cat}+AND+ti:{kw}"
76
+ api_url = (
77
+ f"http://export.arxiv.org/api/query"
78
+ f"?search_query={search_query}&sortBy=relevance&max_results=2"
79
+ )
80
+ req = urllib.request.Request(api_url, headers={"User-Agent": "ResearchRadar/1.0"})
81
+ try:
82
+ with urllib.request.urlopen(req, timeout=15) as resp:
83
+ xml_text = resp.read().decode("utf-8")
84
+ root = ET.fromstring(xml_text)
85
+ ns = {"atom": "http://www.w3.org/2005/Atom"}
86
+ for entry in root.findall("atom:entry", ns):
87
+ t = entry.find("atom:title", ns)
88
+ s = entry.find("atom:summary", ns)
89
+ if t is not None and t.text and s is not None and s.text:
90
+ all_abstracts.append({
91
+ "title": t.text.strip().replace("\n", " "),
92
+ "abstract": s.text.strip().replace("\n", " ")[:800],
93
+ })
94
+ except Exception:
95
+ continue
96
+
97
+ if not all_abstracts:
98
+ return ""
99
+
100
+ # 组装为上下文文本
101
+ lines = ["## 该领域的相关综述/调查论文(供参考领域全景)"]
102
+ for i, a in enumerate(all_abstracts[:4]):
103
+ lines.append(f"{i+1}. **{a['title']}**: {a['abstract']}")
104
+ return "\n\n".join(lines)
105
+
106
+
107
+ def _mine_comparison_algorithms(arxiv_id: str, title: str, abstract: str) -> tuple[list[str], dict[str, int]]:
108
+ """从 Semantic Scholar 引用网络中挖掘对比实验算法。
109
+
110
+ 核心思路:论文的 references(引用的论文)中通常包含对比实验的 baseline 方法,
111
+ 通过提取这些论文的标题作为额外搜索词,可以大幅提升 GitHub 搜索的覆盖度。
112
+
113
+ Returns:
114
+ (extra_queries, citation_map): 额外搜索词列表, {paper_title: citation_count}
115
+ """
116
+ s2_url = (
117
+ f"https://api.semanticscholar.org/graph/v1/paper/ArXiv:{arxiv_id}"
118
+ f"?fields=references.title,references.citationCount,references.abstract"
119
+ f"&limit=50"
120
+ )
121
+ req = urllib.request.Request(s2_url, headers={"User-Agent": "ResearchRadar/1.0"})
122
+ try:
123
+ with urllib.request.urlopen(req, timeout=20) as resp:
124
+ data = json.loads(resp.read().decode("utf-8"))
125
+ except Exception as e:
126
+ print(f" [WARN] Semantic Scholar API 不可用,跳过对比算法挖掘: {e}")
127
+ return [], {}
128
+
129
+ refs = data.get("references", [])
130
+ if not refs:
131
+ return [], {}
132
+
133
+ # 按引用量排序,取 top 15
134
+ refs.sort(key=lambda r: r.get("citationCount", 0), reverse=True)
135
+ top_refs = refs[:15]
136
+
137
+ # 构建标题列表供 LLM 识别方法论文
138
+ title_list = []
139
+ citation_map = {}
140
+ for r in top_refs:
141
+ t = (r.get("title") or "").strip()
142
+ cc = r.get("citationCount", 0)
143
+ if t and len(t) > 10:
144
+ title_list.append(f"- [{cc} cites] {t}")
145
+ citation_map[t] = cc
146
+
147
+ if len(title_list) < 3:
148
+ return [], citation_map
149
+
150
+ # 用 LLM 从引用论文标题中识别哪些是方法/算法论文
151
+ system_prompt = """你是学术论文分析专家。从引用论文列表中识别哪些是提出了具体算法/方法的论文。
152
+
153
+ 排除标准:
154
+ - 数据集/benchmark 论文(如 ImageNet, CIFAR, MVTec AD)
155
+ - 综述/survey 论文
156
+ - 纯理论/数学论文
157
+ - 框架/库论文(如 PyTorch, TensorFlow)
158
+
159
+ 保留标准:
160
+ - 提出了具体的模型/架构/算法名称
161
+ - 可以作为对比实验的 baseline 方法
162
+
163
+ 输出严格 JSON:
164
+ {"methods": ["方法名1", "方法名2"], "search_queries": ["method1 pytorch implementation", "method2 official code"]}
165
+
166
+ 方法名尽量使用论文中常用的英文缩写或全称。"""
167
+
168
+ user_prompt = f"""输入论文标题: {title[:200]}
169
+
170
+ 引用论文列表:
171
+ {chr(10).join(title_list)}
172
+
173
+ 请识别哪些是方法/算法论文,生成对应的 GitHub 搜索词。"""
174
+
175
+ try:
176
+ raw = call_llm_json(system_prompt, user_prompt, temperature=0.2, max_tokens=800)
177
+ data = parse_json_safe(raw, "comparison_miner")
178
+ methods = data.get("methods", [])
179
+ queries = data.get("search_queries", [])
180
+ print(f" 从引用网络识别到 {len(methods)} 个对比算法: {methods[:8]}")
181
+ return queries[:8], citation_map
182
+ except Exception as e:
183
+ print(f" [WARN] 对比算法识别失败,降级: {e}")
184
+ # 降级:直接用引用论文标题作为搜索词
185
+ fallback_queries = []
186
+ for t in list(citation_map.keys())[:5]:
187
+ short = re.sub(r'[^\w\s-]', '', t).strip()[:80]
188
+ if len(short) > 15:
189
+ fallback_queries.append(f"{short} pytorch implementation")
190
+ return fallback_queries, citation_map
191
+
192
+
193
  def run(arxiv_url: str, top_n: int = 5) -> dict:
194
  """主入口:先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
195
 
 
226
  print(f" 分类: {', '.join(paper.get('categories', []))}")
227
  print(f" 耗时: {time.time() - t0:.1f}s")
228
 
229
+ # ================================================================
230
+ # [1.5] 对比实验算法挖掘(从 S2 引用网络提取算法名)
231
+ # ================================================================
232
+ print()
233
+ print("[1.5] 正在从论文引用网络挖掘对比实验算法...")
234
+ t0 = time.time()
235
+ arxiv_id = paper.get("arxiv_id", "")
236
+ comparison_queries, citation_map = _mine_comparison_algorithms(arxiv_id, title, abstract)
237
+ if comparison_queries:
238
+ print(f" 生成 {len(comparison_queries)} 个对比算法搜索词:")
239
+ for q in comparison_queries[:5]:
240
+ print(f" - {q}")
241
+ print(f" 耗时: {time.time() - t0:.1f}s")
242
+
243
  # ================================================================
244
  # [2/5] 宽泛 GitHub 搜索(甲 Workflow)
245
  # ================================================================
 
248
  t0 = time.time()
249
 
250
  broad_queries = _extract_broad_queries(title, abstract)
251
+ # 合并对比算法搜索词,去重
252
+ all_queries = list(dict.fromkeys(comparison_queries + broad_queries))[:12]
253
+ print(f" 生成 {len(broad_queries)} 个宽泛搜索词 + {len(comparison_queries)} 个对比算法搜索词 = {len(all_queries)} 个总搜索词:")
254
+ for q in all_queries:
255
  print(f" - {q}")
256
 
257
  try:
258
+ broad_results = search_repos(all_queries, max_per_keyword=5)
259
  except Exception as e:
260
  return {
261
  "paper": paper,
 
282
  filtered_results = _filter_repos(title, abstract, broad_results)
283
  print(f" 耗时: {time.time() - t0:.1f}s")
284
 
285
+ # ================================================================
286
+ # [2.6] 领域上下文扩充(从 arxiv 搜索综述论文补充领域知识)
287
+ # ================================================================
288
+ print()
289
+ print("[2.6] 正在扩充领域上下文(搜索相关综述论文)...")
290
+ t0 = time.time()
291
+ domain_context = _enrich_domain_context(title, abstract, paper.get("categories", []))
292
+ if domain_context:
293
+ print(f" 获取到 {domain_context.count('**') // 2} 篇相关综述的摘要")
294
+ else:
295
+ print(f" 未找到相关综述(将仅基于论文摘要和仓库数据进行分析)")
296
+ print(f" 耗时: {time.time() - t0:.1f}s")
297
+
298
  # ================================================================
299
  # [3/5] 基于仓库数据归纳方法族(Agent 1) ← 核心改动
300
  # ================================================================
 
303
  t0 = time.time()
304
 
305
  try:
306
+ direction = analyze_direction(title, abstract, filtered_results, domain_context)
307
  except Exception as e:
308
  return {
309
  "paper": paper,
 
318
  for mf in families:
319
  matched = mf.get("matched_repos", [])
320
  print(f" - {mf.get('family_name', '?')}: {len(matched)} 个仓库 {matched}")
321
+
322
+ # 防混淆校验:检查子领域是否与论文标题存在语义关联
323
+ _sanity_check_direction(title, subfield)
324
  print(f" 耗时: {time.time() - t0:.1f}s")
325
 
326
  # ================================================================
 
605
  return filtered
606
 
607
 
608
+ def _sanity_check_direction(title: str, subfield: str) -> None:
609
+ """防混淆检查:验证子领域分析是否与论文标题存在最低限度的语义��联。
610
+
611
+ 如果子领域关键词与标题完全无关,可能是 LLM 混淆了论文。
612
+ 仅打印警告,不阻断流程。
613
+ """
614
+ # 提取标题中的实义词(长度>=4,排除停用词)
615
+ title_stops = {
616
+ 'a', 'an', 'the', 'of', 'for', 'in', 'on', 'to', 'and', 'or', 'is', 'are',
617
+ 'we', 'our', 'that', 'this', 'with', 'from', 'by', 'as', 'at', 'be', 'it',
618
+ 'its', 'not', 'can', 'has', 'have', 'been', 'was', 'were', 'will', 'would',
619
+ 'could', 'should', 'may', 'do', 'does', 'did', 'so', 'if', 'no', 'new',
620
+ 'based', 'using', 'which', 'into', 'such', 'than', 'then', 'these', 'those',
621
+ 'propose', 'present', 'method', 'approach', 'framework', 'novel', 'model',
622
+ 'learning', 'deep', 'via', 'et', 'al',
623
+ }
624
+ title_words = set(
625
+ w.lower() for w in re.sub(r'[^\w\s]', ' ', title).split()
626
+ if len(w) >= 4 and w.lower() not in title_stops
627
+ )
628
+ subfield_lower = subfield.lower()
629
+ overlap = [w for w in title_words if w in subfield_lower]
630
+ if not overlap and title_words:
631
+ print(f" ⚠️ [防混淆警告] 子领域\"{subfield}\"与论文标题无关键词重叠")
632
+ print(f" 标题关键词: {sorted(title_words)[:10]}")
633
+ print(f" 这可能是 LLM 混淆了论文,请人工核实分析结果。")
634
+
635
+
636
  def _make_error_evaluation(error_msg: str) -> dict:
637
  """构造一个表示评估失败的 evaluation dict"""
638
  return {