from __future__ import annotations import argparse import posixpath import re from dataclasses import dataclass from pathlib import Path, PurePosixPath TITLE_RE = re.compile(r"^#\s+(.+)$", re.MULTILINE) FENCED_BLOCK_RE = re.compile( r"(^```.*?$.*?^```$|^~~~.*?$.*?^~~~$)", re.MULTILINE | re.DOTALL, ) INLINE_CODE_RE = re.compile(r"(`[^`]*`)") MANIFEST_NAME = ".astrbot-wiki-sync-manifest" SOURCE_ALIASES = { "zh/config/providers/start.md": "zh/providers/start.md", "en/config/providers/start.md": "en/providers/start.md", } LANG_CONFIG = { "zh": { "index_title": "# AstrBot 中文文档", "index_intro": "该页面由 `AstrBot-docs` 自动同步到 GitHub Wiki。", "index_links": [ ("关于 AstrBot", "zh-what-is-astrbot"), ("社区", "zh-community"), ("常见问题", "zh-faq"), ], "home_intro": "该 Wiki 由 `AstrBot-docs` 自动同步生成。", "home_links": [ ("中文文档入口", "zh-index"), ("English Docs", "Home-en"), ], "sidebar_language_label": "Chinese", "sidebar_home_label": "首页", "sidebar_home_target": "Home", "sidebar_docs_entry_label": "文档入口", }, "en": { "index_title": "# AstrBot English Documentation", "index_intro": "This page is synchronized automatically from `AstrBot-docs` to the GitHub wiki.", "index_links": [ ("What is AstrBot", "en-what-is-astrbot"), ("Community", "en-community"), ("FAQ", "en-faq"), ], "home_intro": "This wiki is synchronized automatically from `AstrBot-docs`.", "home_links": [ ("English docs entry", "en-index"), ("中文文档入口", "Home"), ], "sidebar_language_label": "English", "sidebar_home_label": "Home", "sidebar_home_target": "Home-en", "sidebar_docs_entry_label": "Docs Entry", }, } @dataclass class PageInfo: source_path: str page_name: str title: str content: str language: str group: str is_index: bool @dataclass class ResolutionResult: resolved_path: str | None ambiguous_matches: tuple[str, ...] = () @dataclass class MarkdownLink: start: int end: int prefix: str target: str suffix: str @dataclass class Segment: kind: str text: str def repo_root() -> Path: return Path(__file__).resolve().parents[1] def discover_source_pages(source_root: str) -> tuple[str, ...]: root = Path(source_root) pages = [] for language in ("zh", "en"): language_root = root / language if not language_root.exists(): continue for path in language_root.rglob("*.md"): pages.append(path.relative_to(root).as_posix()) return tuple(sorted(pages)) def find_label_end(content: str, label_start: int) -> int: index = label_start + 1 while index < len(content): close = content.find("]", index) if close == -1: return -1 if close > label_start and content[close - 1] == "\\": index = close + 1 continue lookahead = close + 1 while lookahead < len(content) and content[lookahead].isspace(): lookahead += 1 if lookahead < len(content) and content[lookahead] == "(": return close index = close + 1 return -1 def find_target_end(content: str, target_start: int) -> int: depth = 0 index = target_start while index < len(content): character = content[index] if character == "\\": index += 2 continue if character == "(": depth += 1 elif character == ")": if depth == 0: return index depth -= 1 index += 1 return -1 def iter_markdown_links(content: str): """Yield inline Markdown links only. This scanner intentionally handles inline `[]()` links used in the docs tree. It does not parse reference-style links or arbitrary HTML. """ index = 0 while index < len(content): label_start = content.find("[", index) if label_start == -1: break link_start = ( label_start - 1 if label_start > 0 and content[label_start - 1] == "!" else label_start ) label_end = find_label_end(content, label_start) if label_end == -1: index = label_start + 1 continue target_start = label_end + 1 while target_start < len(content) and content[target_start].isspace(): target_start += 1 if target_start >= len(content) or content[target_start] != "(": index = label_end + 1 continue target_start += 1 target_end = find_target_end(content, target_start) if target_end == -1: index = label_end + 1 continue yield MarkdownLink( start=link_start, end=target_end + 1, prefix=content[link_start:target_start], target=content[target_start:target_end], suffix=")", ) index = target_end + 1 def split_anchor(target: str) -> tuple[str, str]: if "#" not in target: return target, "" base, anchor = target.split("#", 1) return base, f"#{anchor}" def prepare_candidate_path(path: PurePosixPath) -> PurePosixPath: if not path.suffix: path = path.with_suffix(".md") normalized = PurePosixPath(posixpath.normpath(path.as_posix())) normalized_text = normalized.as_posix() aliased = SOURCE_ALIASES.get(normalized_text, normalized_text) return PurePosixPath(aliased) def language_for_source(source_path: str) -> str: return PurePosixPath(source_path).parts[0] def parse_doc_target(target: str) -> tuple[str, str] | None: if target.startswith(("http://", "https://", "mailto:", "#")): return None base_target, anchor = split_anchor(target) if not base_target: return None suffix = PurePosixPath(base_target).suffix.lower() if suffix and suffix != ".md": return None return base_target, anchor def find_existing_source_path( candidate: PurePosixPath, source_root: Path, source_pages: tuple[str, ...], ) -> ResolutionResult: candidate_text = candidate.as_posix() if (source_root / candidate_text).exists(): return ResolutionResult(resolved_path=candidate_text) language = candidate.parts[0] if candidate.parts else "" suffix = ( PurePosixPath(*candidate.parts[1:]).as_posix() if len(candidate.parts) > 1 else "" ) if not suffix: return ResolutionResult(resolved_path=None) prefix = f"{language}/" full_suffix = f"{language}/{suffix}" matches = [ page for page in source_pages if page.startswith(prefix) and (page == full_suffix or page.endswith(f"/{suffix}")) ] if len(matches) == 1: return ResolutionResult(resolved_path=matches[0]) if len(matches) > 1: return ResolutionResult( resolved_path=None, ambiguous_matches=tuple(sorted(matches)), ) return ResolutionResult(resolved_path=None) def resolve_link_path( base_target: str, source_path: str, source_root: Path, source_pages: tuple[str, ...], ) -> ResolutionResult: source_language = language_for_source(source_path) if base_target.startswith("/"): target = base_target.lstrip("/") if not target: candidate = PurePosixPath(source_language) / "index.md" elif target in {"en", "en/"}: candidate = PurePosixPath("en") / "index.md" elif target in {"zh", "zh/"}: candidate = PurePosixPath("zh") / "index.md" elif target.startswith(("en/", "zh/")): candidate = PurePosixPath(target) else: language_root = source_language if source_language == "en" else "zh" candidate = PurePosixPath(language_root) / target else: candidate = PurePosixPath(source_path).parent / base_target candidate = prepare_candidate_path(candidate) return find_existing_source_path(candidate, source_root, source_pages) class LinkResolver: def __init__(self, source_root: Path): self.source_root = Path(source_root) self.source_pages = discover_source_pages(str(self.source_root)) def resolve_base_target( self, base_target: str, source_path: str ) -> ResolutionResult: return resolve_link_path( base_target=base_target, source_path=source_path, source_root=self.source_root, source_pages=self.source_pages, ) def resolve_markdown_target( self, target: str, source_path: str ) -> tuple[str | None, str]: parsed_target = parse_doc_target(target) if parsed_target is None: return None, "" base_target, anchor = parsed_target result = self.resolve_base_target(base_target, source_path) return result.resolved_path, anchor def rewrite_link_target(target: str, source_path: str, resolver: LinkResolver) -> str: resolved, anchor = resolver.resolve_markdown_target(target, source_path) if resolved is None: return target return f"{page_name_for_source(resolved)}{anchor}" def rewrite_links_in_segment( segment: str, source_path: str, resolver: LinkResolver, ) -> str: links = list(iter_markdown_links(segment)) if not links: return segment result: list[str] = [] previous_end = 0 for link in links: result.append(segment[previous_end : link.start]) result.append( f"{link.prefix}{rewrite_link_target(link.target, source_path, resolver)}{link.suffix}", ) previous_end = link.end result.append(segment[previous_end:]) return "".join(result) def iter_segments(content: str): last_end = 0 for fenced in FENCED_BLOCK_RE.finditer(content): before = content[last_end : fenced.start()] if before: last_inline_end = 0 for inline in INLINE_CODE_RE.finditer(before): if inline.start() > last_inline_end: yield Segment("text", before[last_inline_end : inline.start()]) yield Segment("inline_code", inline.group(0)) last_inline_end = inline.end() if last_inline_end < len(before): yield Segment("text", before[last_inline_end:]) yield Segment("code_block", fenced.group(0)) last_end = fenced.end() tail = content[last_end:] if not tail: return last_inline_end = 0 for inline in INLINE_CODE_RE.finditer(tail): if inline.start() > last_inline_end: yield Segment("text", tail[last_inline_end : inline.start()]) yield Segment("inline_code", inline.group(0)) last_inline_end = inline.end() if last_inline_end < len(tail): yield Segment("text", tail[last_inline_end:]) def rewrite_links( content: str, source_path: str, resolver: LinkResolver, ) -> str: output: list[str] = [] for segment in iter_segments(content): if segment.kind == "text": output.append( rewrite_links_in_segment( segment.text, source_path=source_path, resolver=resolver, ) ) continue output.append(segment.text) return "".join(output) def find_unresolved_doc_links(source_root: Path) -> list[str]: unresolved: list[str] = [] root = Path(source_root) resolver = LinkResolver(root) for source_path in resolver.source_pages: content = (root / source_path).read_text(encoding="utf-8") for link in iter_markdown_links(content): resolved_path, _ = resolver.resolve_markdown_target( link.target, source_path ) if resolved_path is not None: continue parsed_target = parse_doc_target(link.target) if parsed_target is None: continue base_target, _ = parsed_target resolution = resolver.resolve_base_target(base_target, source_path) if resolution.ambiguous_matches: unresolved.append( f"{source_path} -> {link.target} (ambiguous: {', '.join(resolution.ambiguous_matches)})", ) continue unresolved.append(f"{source_path} -> {link.target}") return unresolved def check_unresolved_doc_links(source_root: Path) -> None: unresolved = find_unresolved_doc_links(source_root) if not unresolved: return issues = "\n".join(f"- {item}" for item in unresolved) raise ValueError(f"Unresolved internal doc links found:\n{issues}") def page_name_for_source(source_path: str) -> str: if not source_path.endswith(".md"): raise ValueError(f"Unsupported source path: {source_path}") return source_path[:-3].replace("/", "-") def strip_frontmatter(content: str) -> str: if not content.startswith("---\n"): return content closing = content.find("\n---\n", 4) if closing == -1: return content return content[closing + 5 :].lstrip("\n") def normalize_content(content: str) -> str: stripped = content.rstrip() if not stripped: return "" return f"{stripped}\n" def default_title_for_source(source_path: str) -> str: stem = PurePosixPath(source_path).stem return stem.replace("-", " ") def extract_title(content: str, source_path: str) -> str: match = TITLE_RE.search(content) if match: return match.group(1).strip() return default_title_for_source(source_path) def build_language_index(language: str, page_names: set[str]) -> str: config = LANG_CONFIG[language] lines = [config["index_title"], "", config["index_intro"], ""] for label, page_name in config["index_links"]: if page_name in page_names: lines.append(f"- [{label}]({page_name})") return normalize_content("\n".join(lines)) def build_home_page(language: str) -> str: config = LANG_CONFIG[language] lines = ["# AstrBot Wiki", "", config["home_intro"], ""] for label, target in config["home_links"]: lines.append(f"- [{label}]({target})") return normalize_content("\n".join(lines)) def build_sidebar(page_infos: list[PageInfo]) -> str: lines: list[str] = [] for language in ("zh", "en"): config = LANG_CONFIG[language] infos = [ info for info in page_infos if info.language == language and not info.is_index ] infos.sort(key=lambda info: info.source_path) lines.append(f"### {config['sidebar_language_label']}") lines.append("") lines.append( f"- [{config['sidebar_home_label']}]({config['sidebar_home_target']})", ) lines.append( f"- [{config['sidebar_docs_entry_label']}]({language}-index)", ) grouped: dict[str, list[PageInfo]] = {} for info in infos: grouped.setdefault(info.group, []).append(info) for group_name in sorted(grouped): lines.append(f"- {group_name}") for info in grouped[group_name]: lines.append(f" - [{info.title}]({info.page_name})") lines.append("") return normalize_content("\n".join(lines)) def build_page_info( source_root: Path, source_path: str, resolver: LinkResolver ) -> PageInfo: source_file = source_root / source_path content = source_file.read_text(encoding="utf-8") content = strip_frontmatter(content) content = rewrite_links(content, source_path=source_path, resolver=resolver) content = normalize_content(content) relative = PurePosixPath(source_path) parts = relative.parts group = "Top Level" if len(parts) <= 2 else parts[1].replace("-", " ") return PageInfo( source_path=source_path, page_name=page_name_for_source(source_path), title=extract_title(content, source_path), content=content, language=language_for_source(source_path), group=group, is_index=relative.name == "index.md", ) def read_manifest(wiki_root: Path) -> set[str]: manifest_path = wiki_root / MANIFEST_NAME if not manifest_path.exists(): return set() return { line.strip() for line in manifest_path.read_text(encoding="utf-8").splitlines() if line.strip() } def write_manifest(wiki_root: Path, file_names: set[str]) -> None: manifest_path = wiki_root / MANIFEST_NAME content = "\n".join(sorted(file_names)) if content: content = f"{content}\n" manifest_path.write_text(content, encoding="utf-8") def write_file(path: Path, content: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(content, encoding="utf-8") def sync_docs_to_wiki(source_root: Path, wiki_root: Path) -> None: source_root = Path(source_root) wiki_root = Path(wiki_root) wiki_root.mkdir(parents=True, exist_ok=True) resolver = LinkResolver(source_root) page_infos = [ build_page_info(source_root, source_path, resolver) for source_path in resolver.source_pages ] page_names = {info.page_name for info in page_infos} for info in page_infos: if info.is_index and not info.content.strip(): generated = build_language_index(info.language, page_names) info.content = generated info.title = extract_title(generated, info.source_path) desired_files = {f"{info.page_name}.md": info.content for info in page_infos} desired_files["Home.md"] = build_home_page("zh") desired_files["Home-en.md"] = build_home_page("en") desired_files["_Sidebar.md"] = build_sidebar(page_infos) previously_managed = read_manifest(wiki_root) for existing_name in previously_managed - set(desired_files): existing_path = wiki_root / existing_name if existing_path.exists(): existing_path.unlink() for file_name, content in desired_files.items(): write_file(wiki_root / file_name, content) managed_files = set(desired_files) write_manifest(wiki_root, managed_files) def main() -> int: parser = argparse.ArgumentParser( description="Sync AstrBot docs content to GitHub wiki pages." ) parser.add_argument( "--source-root", default=str(repo_root()), help="Path to the AstrBot-docs repository root.", ) parser.add_argument( "--wiki-root", help="Path to the checked out wiki repository.", ) parser.add_argument( "--check-links-only", action="store_true", help="Validate internal doc links without writing wiki files.", ) args = parser.parse_args() if not args.check_links_only and not args.wiki_root: parser.error("--wiki-root is required unless --check-links-only is set") check_unresolved_doc_links(Path(args.source_root)) if args.check_links_only: return 0 sync_docs_to_wiki( source_root=Path(args.source_root), wiki_root=Path(args.wiki_root) ) return 0 if __name__ == "__main__": raise SystemExit(main())