| """ |
| 柱 1:情境工程 — AGENTS.md 解析器 + 任務路由引擎 |
| ================================================ |
| |
| 漸進式情境披露(Progressive Disclosure)的核心元件。 |
| 根據任務描述的關鍵字,匹配 AGENTS.md 中的路由段落, |
| 回傳最相關的文件引用列表。 |
| |
| 層級邊界:L1(最底層)— 不可引用 harness.constraints 或 harness.entropy |
| """ |

import re
import logging
from pathlib import Path
from dataclasses import dataclass, field

logger = logging.getLogger("threathunter.harness.context")


@dataclass
class DocReference:
    """A single document reference returned by the router."""

    path: str
    section: str
    description: str
    relevance: float


@dataclass
class RouteSection:
    """One routing section parsed out of AGENTS.md."""

    title: str
    keywords: list[str] = field(default_factory=list)
    references: list[str] = field(default_factory=list)
    description: str = ""


class AgentMap:
    """
    AGENTS.md parser + task-routing engine.

    Parses AGENTS.md into a structured routing table, matches sections by
    keyword overlap with the task description, and returns the most relevant
    document references.

    Usage:
        agent_map = AgentMap(project_root)
        agent_map.load()
        refs = agent_map.query_context("修復 memory_tool 的測試失敗")
        # (task string above: "fix the failing tests for memory_tool")
    """

    def __init__(self, project_root: Path):
        self.project_root = project_root
        self.agents_md_path = project_root / "AGENTS.md"
        self.sections: list[RouteSection] = []
        self.global_rules: list[str] = []
        self._loaded = False

    def load(self) -> bool:
        """
        Load and parse AGENTS.md.

        Returns:
            Whether loading succeeded.
        """
        if not self.agents_md_path.exists():
            logger.warning(f"AGENTS.md not found: {self.agents_md_path}")
            return False

        try:
            content = self.agents_md_path.read_text(encoding="utf-8")
            self._parse(content)
            self._loaded = True
            logger.info(
                "✅ AGENTS.md loaded: "
                f"{len(self.sections)} route sections, "
                f"{len(self.global_rules)} global rules"
            )
            return True
        except Exception as e:
            logger.error(f"Failed to parse AGENTS.md: {e}")
            return False

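    # For orientation (hypothetical values, matching the fragment sketched at
    # the top of this module): the heading "### Memory tool(「memory_tool」「記憶」)"
    # parses into RouteSection(title="Memory tool(「memory_tool」「記憶」)",
    # keywords=["memory_tool", "記憶"]); each "→" line appends its back-ticked
    # path to RouteSection.references and its trailing text to .description.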
    def _parse(self, content: str) -> None:
        """Parse the Markdown structure of AGENTS.md."""
        self.sections = []
        self.global_rules = []

        current_section: RouteSection | None = None

        for line in content.split("\n"):
            stripped = line.strip()

            # "### " headings open a new route section; keywords are the terms
            # wrapped in CJK corner brackets (「...」) in the heading title.
            if stripped.startswith("### "):
                if current_section:
                    self.sections.append(current_section)
                title = stripped[4:].strip()
                keywords = re.findall(r"「(.+?)」", title)
                current_section = RouteSection(
                    title=title,
                    keywords=keywords,
                )
                continue

            # Arrow lines ("→" or "->") inside a section carry document
            # references as back-ticked paths plus a free-text description.
            if current_section and ("→" in stripped or "->" in stripped):
                paths = re.findall(r"`([^`]+)`", stripped)
                for path in paths:
                    if "/" in path or path.endswith((".py", ".md", ".toml", ".json")):
                        current_section.references.append(path)
                # Whatever remains after removing the back-ticked paths and the
                # arrow characters becomes part of the section description.
                desc = re.sub(r"`[^`]+`", "", stripped).strip("→-> ").strip()
                if desc:
                    current_section.description += desc + " "
                continue

            # Top-level bullets before the first "###" heading are global rules.
            if not current_section and stripped.startswith("- "):
                self.global_rules.append(stripped[2:])

        # Flush the last open section.
        if current_section:
            self.sections.append(current_section)

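    # Worked example (hypothetical numbers): if a section contributes the token
    # set {"memory_tool", "測", "試"} and the task description tokenizes to a set
    # containing all three, the overlap is 3 and relevance = 3 / 3 = 1.0; a task
    # sharing only "memory_tool" scores 1 / 3 ≈ 0.33.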
    def query_context(self, task: str, top_k: int = 5) -> list[DocReference]:
        """
        Core of progressive disclosure: match document references to a task.

        Matching uses keyword-overlap scoring:
            relevance = |task_tokens ∩ section_tokens| / |section_tokens|

        Args:
            task: The user's task description.
            top_k: Return at most the N most relevant references.

        Returns:
            DocReference objects sorted by relevance, best match first.
        """
        if not self._loaded:
            self.load()

        task_tokens = set(self._tokenize(task))
        results: list[DocReference] = []

        for section in self.sections:
            # A section's token set combines its explicit keywords and its title.
            section_tokens = set()
            for kw in section.keywords:
                section_tokens.update(self._tokenize(kw))
            section_tokens.update(self._tokenize(section.title))

            if not section_tokens:
                continue

            # Score by keyword overlap, normalized by the section's token count.
            overlap = len(task_tokens & section_tokens)
            relevance = overlap / max(len(section_tokens), 1)

            if relevance > 0:
                for ref_path in section.references:
                    results.append(DocReference(
                        path=ref_path,
                        section=section.title,
                        description=section.description.strip(),
                        relevance=relevance,
                    ))

        # Best matches first, truncated to the requested number of references.
        results.sort(key=lambda r: r.relevance, reverse=True)
        return results[:top_k]

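    # Example (hypothetical input): _tokenize("修復 memory_tool 測試") returns
    # ["memory_tool", "修", "復", "測", "試"]. Note that all English tokens are
    # emitted before the per-character Chinese tokens, regardless of where they
    # appear in the input text.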
    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """
        Naive mixed Chinese/English tokenization.

        English: extracted as identifier-like runs and lowercased.
        Chinese: split per character (each CJK character is its own token).
        """
        tokens = []
        # Identifier-like English tokens (letters, digits, underscores).
        english_tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", text)
        tokens.extend(t.lower() for t in english_tokens)
        # CJK Unified Ideographs, one token per character.
        chinese_chars = re.findall(r"[\u4e00-\u9fff]", text)
        tokens.extend(chinese_chars)
        return tokens
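

# Minimal manual smoke-test sketch, not part of the module's public surface:
# assumes an AGENTS.md exists in the current working directory, and the task
# string below is illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    agent_map = AgentMap(Path.cwd())
    if agent_map.load():
        for ref in agent_map.query_context("修復 memory_tool 的測試失敗"):
            print(f"{ref.relevance:.2f}  {ref.path}  ({ref.section})")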