Spaces:

ZZZyx3587
/

ResearchRadar

Running

App Files Files Community

ZZZyx3587 committed 19 days ago

Commit

097cbfc

verified ·

1 Parent(s): 3b1ebc4

Upload run.py with huggingface_hub

Browse files

Files changed (1) hide show

run.py +27 -18

run.py CHANGED Viewed

@@ -251,29 +251,28 @@ def _search_s2_papers(query: str, limit: int = 5) -> list[dict]:
     return papers
-def run(arxiv_url: str, top_n: int = 5) -> dict:
     """主入口：先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
-    新流程（方案 B）：
-      第一层：论文信息获取 (Workflow)
-      第二层：宽泛 GitHub 搜索 (Workflow)
-      第三层：基于仓库数据归纳方法族 (Agent 1)  ← 不再是 LLM 凭记忆
-      第四层：仓库评估 (Agent 2)
     Args:
         arxiv_url: arxiv 论文 URL
         top_n: 最终评估的仓库数量
     Returns:
         dict: {"paper": {...}, "direction": {...}, "repos": [{...}], "error": "..."}
     """
     t_start = time.time()
     # ================================================================
-    # [1/5] 论文信息获取（甲 Workflow）
     # ================================================================
     print("=" * 60)
-    print("[1/5] 正在获取论文信息...")
     t0 = time.time()
     try:
         paper = fetch_paper_info(arxiv_url)
@@ -292,6 +291,7 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     # ================================================================
     print()
     print("[1.5] 正在从论文引用网络挖掘对比实验算法...")
     t0 = time.time()
     arxiv_id = paper.get("arxiv_id", "")
     comparison_queries, citation_map = _mine_comparison_algorithms(arxiv_id, title, abstract)
@@ -306,6 +306,7 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     # ================================================================
     print()
     print("[1.8] 正在 Semantic Scholar 中搜索同领域论文（覆盖全工科）...")
     t0 = time.time()
     title_kws = _extract_title_keywords(title)
     s2_papers = _search_s2_papers(" ".join(title_kws), limit=8)
@@ -327,10 +328,11 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
-    # [2/5] 宽泛 GitHub 搜索（甲 Workflow）
     # ================================================================
     print()
-    print("[2/5] 正在宽泛搜索 GitHub（先搜仓库，再让 Agent 分析）...")
     t0 = time.time()
     broad_queries = _extract_broad_queries(title, abstract)
@@ -364,15 +366,17 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     # ================================================================
     print()
     print("[2.5] 正在用 LLM 过滤不相关仓库...")
     t0 = time.time()
     filtered_results = _filter_repos(title, abstract, broad_results)
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
-    # [2.6] 领域上下文扩充（从 arxiv 搜索综述论文补充领域知识）
     # ================================================================
     print()
     print("[2.6] 正在扩充领域上下文（搜索相关综述论文）...")
     t0 = time.time()
     domain_context = _enrich_domain_context(title, abstract, paper.get("categories", []))
     if domain_context:
@@ -382,10 +386,11 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
-    # [3/5] 基于仓库数据归纳方法族（Agent 1）  ← 核心改动
     # ================================================================
     print()
-    print(f"[3/5] 正在分析 {len(filtered_results)} 个仓库，归纳方法族（Agent 1）...")
     t0 = time.time()
     try:
@@ -410,10 +415,11 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
-    # [4/5] 筛选仓库 + 构建方法族归属映射
     # ================================================================
     print()
-    print(f"[4/5] 正在筛选并获取仓库详情...")
     t0 = time.time()
     # 从 Agent 1 的 matched_repos 中建立 full_name → family_name 映射
@@ -439,10 +445,11 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
-    # [5/5] 仓库详情获取 + 评估（Agent 2，并行）
     # ================================================================
     print()
-    print(f"[5/5] 正在获取仓库详情并评估（Agent 2，{len(candidates)} 个仓库并行）...")
     t0 = time.time()
     def _eval_single(repo, idx, total):
@@ -508,6 +515,7 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     # ================================================================
     # 审核层：检查 Agent 输出质量和来源可靠性
     # ================================================================
     audit = supervise(title, abstract, direction, evaluated)
     print(f"\n  审核结果: {audit['summary']}")
     print(f"  综合质量评分: {audit['overall_score']}/100")
@@ -518,6 +526,7 @@ def run(arxiv_url: str, top_n: int = 5) -> dict:
     print(f"  总耗时: {time.time() - t_start:.1f}s")
     print("=" * 60)
     return {
         "paper": paper,
         "direction": direction,

     return papers
+def run(arxiv_url: str, top_n: int = 5, progress=None) -> dict:
     """主入口：先搜索 GitHub → 再基于实际仓库归纳方法族 → 最后评估。
     Args:
         arxiv_url: arxiv 论文 URL
         top_n: 最终评估的仓库数量
+        progress: gr.Progress 实例或 callable(fraction, desc)，用于前端进度条
     Returns:
         dict: {"paper": {...}, "direction": {...}, "repos": [{...}], "error": "..."}
     """
     t_start = time.time()
+    def _prog(frac: float, desc: str):
+        if progress:
+            progress(frac, desc=desc)
     # ================================================================
+    # [1/6] 论文信息获取（甲 Workflow）
     # ================================================================
     print("=" * 60)
+    print("[1/6] 正在获取论文信息...")
+    _prog(0.05, "正在从 arxiv 获取论文信息...")
     t0 = time.time()
     try:
         paper = fetch_paper_info(arxiv_url)
     # ================================================================
     print()
     print("[1.5] 正在从论文引用网络挖掘对比实验算法...")
+    _prog(0.12, "正在从引用网络挖掘对比算法...")
     t0 = time.time()
     arxiv_id = paper.get("arxiv_id", "")
     comparison_queries, citation_map = _mine_comparison_algorithms(arxiv_id, title, abstract)
     # ================================================================
     print()
     print("[1.8] 正在 Semantic Scholar 中搜索同领域论文（覆盖全工科）...")
+    _prog(0.18, "正在 Semantic Scholar 搜索同领域论文...")
     t0 = time.time()
     title_kws = _extract_title_keywords(title)
     s2_papers = _search_s2_papers(" ".join(title_kws), limit=8)
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
+    # [2/6] 宽泛 GitHub 搜索（甲 Workflow）
     # ================================================================
     print()
+    print("[2/6] 正在宽泛搜索 GitHub（先搜仓库，再让 Agent 分析）...")
+    _prog(0.22, "正在 GitHub 搜索开源仓库...")
     t0 = time.time()
     broad_queries = _extract_broad_queries(title, abstract)
     # ================================================================
     print()
     print("[2.5] 正在用 LLM 过滤不相关仓库...")
+    _prog(0.32, "正在用 LLM 过滤不相关仓库...")
     t0 = time.time()
     filtered_results = _filter_repos(title, abstract, broad_results)
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
+    # [2.6] 领域上下文扩充（S2 搜索综述论文补充领域知识）
     # ================================================================
     print()
     print("[2.6] 正在扩充领域上下文（搜索相关综述论文）...")
+    _prog(0.38, "正在扩充领域上下文...")
     t0 = time.time()
     domain_context = _enrich_domain_context(title, abstract, paper.get("categories", []))
     if domain_context:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
+    # [3/6] 基于仓库数据归纳方法族（Agent 1）
     # ================================================================
     print()
+    print(f"[3/6] 正在分析 {len(filtered_results)} 个仓库，归纳方法族（Agent 1）...")
+    _prog(0.42, "正在用 LLM 归纳方法族...")
     t0 = time.time()
     try:
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
+    # [4/6] 筛选仓库 + 构建方法族归属映射
     # ================================================================
     print()
+    print(f"[4/6] 正在筛选并获取仓库详情...")
+    _prog(0.55, "正在筛选仓库并获取详情...")
     t0 = time.time()
     # 从 Agent 1 的 matched_repos 中建立 full_name → family_name 映射
     print(f"  耗时: {time.time() - t0:.1f}s")
     # ================================================================
+    # [5/6] 仓库详情获取 + 评估（Agent 2，并行）
     # ================================================================
     print()
+    print(f"[5/6] 正在获取仓库详情并评估（Agent 2，{len(candidates)} 个仓库并行）...")
+    _prog(0.60, f"正在评估 {len(candidates)} 个开源仓库...")
     t0 = time.time()
     def _eval_single(repo, idx, total):
     # ================================================================
     # 审核层：检查 Agent 输出质量和来源可靠性
     # ================================================================
+    _prog(0.90, "正在进行质量审核...")
     audit = supervise(title, abstract, direction, evaluated)
     print(f"\n  审核结果: {audit['summary']}")
     print(f"  综合质量评分: {audit['overall_score']}/100")
     print(f"  总耗时: {time.time() - t_start:.1f}s")
     print("=" * 60)
+    _prog(1.0, "分析完成，正在生成研报...")
     return {
         "paper": paper,
         "direction": direction,