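"""Sync AstrBot docs content to GitHub wiki pages.

Reads Markdown under the zh/ and en/ trees, rewrites internal links to wiki
page names, and generates Home, index, and sidebar pages plus a manifest of
the files the sync manages.
"""
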
from __future__ import annotations

import argparse
import posixpath
import re
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path, PurePosixPath

TITLE_RE = re.compile(r"^#\s+(.+)$", re.MULTILINE)
FENCED_BLOCK_RE = re.compile(
    r"(^```.*?$.*?^```$|^~~~.*?$.*?^~~~$)",
    re.MULTILINE | re.DOTALL,
)
INLINE_CODE_RE = re.compile(r"(`[^`]*`)")
MANIFEST_NAME = ".astrbot-wiki-sync-manifest"
SOURCE_ALIASES = {
    "zh/config/providers/start.md": "zh/providers/start.md",
    "en/config/providers/start.md": "en/providers/start.md",
}
LANG_CONFIG = {
    "zh": {
        "index_title": "# AstrBot 中文文档",
        "index_intro": "该页面由 `AstrBot-docs` 自动同步到 GitHub Wiki。",
        "index_links": [
            ("关于 AstrBot", "zh-what-is-astrbot"),
            ("社区", "zh-community"),
            ("常见问题", "zh-faq"),
        ],
        "home_intro": "该 Wiki 由 `AstrBot-docs` 自动同步生成。",
        "home_links": [
            ("中文文档入口", "zh-index"),
            ("English Docs", "Home-en"),
        ],
        "sidebar_language_label": "Chinese",
        "sidebar_home_label": "首页",
        "sidebar_home_target": "Home",
        "sidebar_docs_entry_label": "文档入口",
    },
    "en": {
        "index_title": "# AstrBot English Documentation",
        "index_intro": "This page is synchronized automatically from `AstrBot-docs` to the GitHub wiki.",
        "index_links": [
            ("What is AstrBot", "en-what-is-astrbot"),
            ("Community", "en-community"),
            ("FAQ", "en-faq"),
        ],
        "home_intro": "This wiki is synchronized automatically from `AstrBot-docs`.",
        "home_links": [
            ("English docs entry", "en-index"),
            ("中文文档入口", "Home"),
        ],
        "sidebar_language_label": "English",
        "sidebar_home_label": "Home",
        "sidebar_home_target": "Home-en",
        "sidebar_docs_entry_label": "Docs Entry",
    },
}


@dataclass
class PageInfo:
    source_path: str
    page_name: str
    title: str
    content: str
    language: str
    group: str
    is_index: bool


@dataclass
class ResolutionResult:
    resolved_path: str | None
    ambiguous_matches: tuple[str, ...] = ()


@dataclass
class MarkdownLink:
    start: int
    end: int
    prefix: str
    target: str
    suffix: str


@dataclass
class Segment:
    kind: str
    text: str


def repo_root() -> Path:
    # This file lives in <docs root>/scripts/, so parents[1] is the docs root.
    return Path(__file__).resolve().parents[1]


def discover_source_pages(source_root: str) -> tuple[str, ...]:
    root = Path(source_root)
    pages = []
    for language in ("zh", "en"):
        language_root = root / language
        if not language_root.exists():
            continue
        for path in language_root.rglob("*.md"):
            pages.append(path.relative_to(root).as_posix())
    return tuple(sorted(pages))
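

# Illustrative: given a tree containing zh/index.md, zh/config/start.md, and
# en/index.md (hypothetical files), discover_source_pages returns the sorted
# tuple ("en/index.md", "zh/config/start.md", "zh/index.md").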


def find_label_end(content: str, label_start: int) -> int:
    """Return the index of the "]" closing a link label, or -1.

    Skips backslash-escaped "]" and only accepts a "]" that is followed,
    after optional whitespace, by "(".
    """
    index = label_start + 1
    while index < len(content):
        close = content.find("]", index)
        if close == -1:
            return -1
        if close > label_start and content[close - 1] == "\\":
            index = close + 1
            continue
        lookahead = close + 1
        while lookahead < len(content) and content[lookahead].isspace():
            lookahead += 1
        if lookahead < len(content) and content[lookahead] == "(":
            return close
        index = close + 1
    return -1


def find_target_end(content: str, target_start: int) -> int:
    """Return the index of the ")" closing a link target, or -1.

    Tracks nested parentheses and skips backslash-escaped characters.
    """
    depth = 0
    index = target_start
    while index < len(content):
        character = content[index]
        if character == "\\":
            index += 2
            continue
        if character == "(":
            depth += 1
        elif character == ")":
            if depth == 0:
                return index
            depth -= 1
        index += 1
    return -1


def iter_markdown_links(content: str) -> Iterator[MarkdownLink]:
    """Yield inline Markdown links only.

    This scanner intentionally handles the inline `[]()` links used in the
    docs tree. It does not parse reference-style links or arbitrary HTML.
    """
    index = 0
    while index < len(content):
        label_start = content.find("[", index)
        if label_start == -1:
            break
        link_start = (
            label_start - 1
            if label_start > 0 and content[label_start - 1] == "!"
            else label_start
        )
        label_end = find_label_end(content, label_start)
        if label_end == -1:
            index = label_start + 1
            continue
        target_start = label_end + 1
        while target_start < len(content) and content[target_start].isspace():
            target_start += 1
        if target_start >= len(content) or content[target_start] != "(":
            index = label_end + 1
            continue
        target_start += 1
        target_end = find_target_end(content, target_start)
        if target_end == -1:
            index = label_end + 1
            continue
        yield MarkdownLink(
            start=link_start,
            end=target_end + 1,
            prefix=content[link_start:target_start],
            target=content[target_start:target_end],
            suffix=")",
        )
        index = target_end + 1
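

# Illustrative: scanning "see [FAQ](./faq.md) and ![logo](img.png)" yields two
# MarkdownLink records with targets "./faq.md" and "img.png"; for the image
# link, start points at the leading "!".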


def split_anchor(target: str) -> tuple[str, str]:
    if "#" not in target:
        return target, ""
    base, anchor = target.split("#", 1)
    return base, f"#{anchor}"


def prepare_candidate_path(path: PurePosixPath) -> PurePosixPath:
    if not path.suffix:
        path = path.with_suffix(".md")
    normalized = PurePosixPath(posixpath.normpath(path.as_posix()))
    normalized_text = normalized.as_posix()
    aliased = SOURCE_ALIASES.get(normalized_text, normalized_text)
    return PurePosixPath(aliased)
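

# Illustrative: PurePosixPath("zh/guide/../config/providers/start") gains the
# ".md" suffix, normalizes to "zh/config/providers/start.md", and is then
# aliased to "zh/providers/start.md" via SOURCE_ALIASES.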


def language_for_source(source_path: str) -> str:
    return PurePosixPath(source_path).parts[0]


def parse_doc_target(target: str) -> tuple[str, str] | None:
    if target.startswith(("http://", "https://", "mailto:", "#")):
        return None
    base_target, anchor = split_anchor(target)
    if not base_target:
        return None
    suffix = PurePosixPath(base_target).suffix.lower()
    if suffix and suffix != ".md":
        return None
    return base_target, anchor
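

# Illustrative: "https://example.com", "#setup", and "logo.png" are all
# ignored (None); "../faq.md#install" parses to ("../faq.md", "#install").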


def find_existing_source_path(
    candidate: PurePosixPath,
    source_root: Path,
    source_pages: tuple[str, ...],
) -> ResolutionResult:
    candidate_text = candidate.as_posix()
    if (source_root / candidate_text).exists():
        return ResolutionResult(resolved_path=candidate_text)
    language = candidate.parts[0] if candidate.parts else ""
    suffix = (
        PurePosixPath(*candidate.parts[1:]).as_posix()
        if len(candidate.parts) > 1
        else ""
    )
    if not suffix:
        return ResolutionResult(resolved_path=None)
    # Fall back to a suffix match within the same language tree: a link written
    # as "zh/start.md" can still resolve to "zh/providers/start.md", but only
    # when exactly one page matches.
    prefix = f"{language}/"
    full_suffix = f"{language}/{suffix}"
    matches = [
        page
        for page in source_pages
        if page.startswith(prefix)
        and (page == full_suffix or page.endswith(f"/{suffix}"))
    ]
    if len(matches) == 1:
        return ResolutionResult(resolved_path=matches[0])
    if len(matches) > 1:
        return ResolutionResult(
            resolved_path=None,
            ambiguous_matches=tuple(sorted(matches)),
        )
    return ResolutionResult(resolved_path=None)


def resolve_link_path(
    base_target: str,
    source_path: str,
    source_root: Path,
    source_pages: tuple[str, ...],
) -> ResolutionResult:
    source_language = language_for_source(source_path)
    if base_target.startswith("/"):
        target = base_target.lstrip("/")
        if not target:
            candidate = PurePosixPath(source_language) / "index.md"
        elif target in {"en", "en/"}:
            candidate = PurePosixPath("en") / "index.md"
        elif target in {"zh", "zh/"}:
            candidate = PurePosixPath("zh") / "index.md"
        elif target.startswith(("en/", "zh/")):
            candidate = PurePosixPath(target)
        else:
            # Absolute links without a language prefix default to the zh tree
            # unless the linking page itself is English.
            language_root = source_language if source_language == "en" else "zh"
            candidate = PurePosixPath(language_root) / target
    else:
        candidate = PurePosixPath(source_path).parent / base_target
    candidate = prepare_candidate_path(candidate)
    return find_existing_source_path(candidate, source_root, source_pages)
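

# Illustrative: from "zh/config/start.md", the relative target "../faq" becomes
# candidate "zh/faq.md", while the absolute target "/en/" resolves to
# "en/index.md".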


class LinkResolver:
    def __init__(self, source_root: Path):
        self.source_root = Path(source_root)
        self.source_pages = discover_source_pages(str(self.source_root))

    def resolve_base_target(
        self, base_target: str, source_path: str
    ) -> ResolutionResult:
        return resolve_link_path(
            base_target=base_target,
            source_path=source_path,
            source_root=self.source_root,
            source_pages=self.source_pages,
        )

    def resolve_markdown_target(
        self, target: str, source_path: str
    ) -> tuple[str | None, str]:
        parsed_target = parse_doc_target(target)
        if parsed_target is None:
            return None, ""
        base_target, anchor = parsed_target
        result = self.resolve_base_target(base_target, source_path)
        return result.resolved_path, anchor


def rewrite_link_target(target: str, source_path: str, resolver: LinkResolver) -> str:
    resolved, anchor = resolver.resolve_markdown_target(target, source_path)
    if resolved is None:
        return target
    return f"{page_name_for_source(resolved)}{anchor}"


def rewrite_links_in_segment(
    segment: str,
    source_path: str,
    resolver: LinkResolver,
) -> str:
    links = list(iter_markdown_links(segment))
    if not links:
        return segment
    result: list[str] = []
    previous_end = 0
    for link in links:
        result.append(segment[previous_end : link.start])
        result.append(
            f"{link.prefix}{rewrite_link_target(link.target, source_path, resolver)}{link.suffix}",
        )
        previous_end = link.end
    result.append(segment[previous_end:])
    return "".join(result)


def iter_inline_segments(text: str) -> Iterator[Segment]:
    """Split a fence-free span into plain text and inline-code segments."""
    last_end = 0
    for inline in INLINE_CODE_RE.finditer(text):
        if inline.start() > last_end:
            yield Segment("text", text[last_end : inline.start()])
        yield Segment("inline_code", inline.group(0))
        last_end = inline.end()
    if last_end < len(text):
        yield Segment("text", text[last_end:])


def iter_segments(content: str) -> Iterator[Segment]:
    """Split Markdown into text, inline-code, and fenced-code segments."""
    last_end = 0
    for fenced in FENCED_BLOCK_RE.finditer(content):
        before = content[last_end : fenced.start()]
        if before:
            yield from iter_inline_segments(before)
        yield Segment("code_block", fenced.group(0))
        last_end = fenced.end()
    tail = content[last_end:]
    if tail:
        yield from iter_inline_segments(tail)


def rewrite_links(
    content: str,
    source_path: str,
    resolver: LinkResolver,
) -> str:
    output: list[str] = []
    for segment in iter_segments(content):
        if segment.kind == "text":
            output.append(
                rewrite_links_in_segment(
                    segment.text,
                    source_path=source_path,
                    resolver=resolver,
                )
            )
            continue
        output.append(segment.text)
    return "".join(output)


def find_unresolved_doc_links(source_root: Path) -> list[str]:
    unresolved: list[str] = []
    root = Path(source_root)
    resolver = LinkResolver(root)
    for source_path in resolver.source_pages:
        content = (root / source_path).read_text(encoding="utf-8")
        for link in iter_markdown_links(content):
            resolved_path, _ = resolver.resolve_markdown_target(
                link.target, source_path
            )
            if resolved_path is not None:
                continue
            parsed_target = parse_doc_target(link.target)
            if parsed_target is None:
                continue
            base_target, _ = parsed_target
            resolution = resolver.resolve_base_target(base_target, source_path)
            if resolution.ambiguous_matches:
                unresolved.append(
                    f"{source_path} -> {link.target} (ambiguous: {', '.join(resolution.ambiguous_matches)})",
                )
                continue
            unresolved.append(f"{source_path} -> {link.target}")
    return unresolved


def check_unresolved_doc_links(source_root: Path) -> None:
    unresolved = find_unresolved_doc_links(source_root)
    if not unresolved:
        return
    issues = "\n".join(f"- {item}" for item in unresolved)
    raise ValueError(f"Unresolved internal doc links found:\n{issues}")


def page_name_for_source(source_path: str) -> str:
    if not source_path.endswith(".md"):
        raise ValueError(f"Unsupported source path: {source_path}")
    return source_path[:-3].replace("/", "-")
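

# Illustrative: "zh/config/providers/start.md" maps to the wiki page name
# "zh-config-providers-start".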


def strip_frontmatter(content: str) -> str:
    if not content.startswith("---\n"):
        return content
    # Search from index 3 so an empty frontmatter block ("---\n---\n") is also
    # detected: the newline ending the opening fence may start the closing
    # "\n---\n" marker.
    closing = content.find("\n---\n", 3)
    if closing == -1:
        return content
    return content[closing + 5 :].lstrip("\n")
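

# Illustrative: "---\ntitle: FAQ\n---\n\n# FAQ\n" strips to "# FAQ\n"; content
# without a leading "---" line passes through unchanged.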


def normalize_content(content: str) -> str:
    stripped = content.rstrip()
    if not stripped:
        return ""
    return f"{stripped}\n"


def default_title_for_source(source_path: str) -> str:
    stem = PurePosixPath(source_path).stem
    return stem.replace("-", " ")


def extract_title(content: str, source_path: str) -> str:
    match = TITLE_RE.search(content)
    if match:
        return match.group(1).strip()
    return default_title_for_source(source_path)


def build_language_index(language: str, page_names: set[str]) -> str:
    config = LANG_CONFIG[language]
    lines = [config["index_title"], "", config["index_intro"], ""]
    for label, page_name in config["index_links"]:
        if page_name in page_names:
            lines.append(f"- [{label}]({page_name})")
    return normalize_content("\n".join(lines))


def build_home_page(language: str) -> str:
    config = LANG_CONFIG[language]
    lines = ["# AstrBot Wiki", "", config["home_intro"], ""]
    for label, target in config["home_links"]:
        lines.append(f"- [{label}]({target})")
    return normalize_content("\n".join(lines))


def build_sidebar(page_infos: list[PageInfo]) -> str:
    lines: list[str] = []
    for language in ("zh", "en"):
        config = LANG_CONFIG[language]
        infos = [
            info
            for info in page_infos
            if info.language == language and not info.is_index
        ]
        infos.sort(key=lambda info: info.source_path)
        lines.append(f"### {config['sidebar_language_label']}")
        lines.append("")
        lines.append(
            f"- [{config['sidebar_home_label']}]({config['sidebar_home_target']})",
        )
        lines.append(
            f"- [{config['sidebar_docs_entry_label']}]({language}-index)",
        )
        grouped: dict[str, list[PageInfo]] = {}
        for info in infos:
            grouped.setdefault(info.group, []).append(info)
        for group_name in sorted(grouped):
            lines.append(f"- {group_name}")
            for info in grouped[group_name]:
                lines.append(f"  - [{info.title}]({info.page_name})")
        lines.append("")
    return normalize_content("\n".join(lines))
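

# Illustrative sidebar shape for the zh section (the grouped page title below
# is a made-up example):
#
# ### Chinese
#
# - [首页](Home)
# - [文档入口](zh-index)
# - config
#   - [Provider 配置](zh-config-providers-start)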


def build_page_info(
    source_root: Path, source_path: str, resolver: LinkResolver
) -> PageInfo:
    source_file = source_root / source_path
    content = source_file.read_text(encoding="utf-8")
    content = strip_frontmatter(content)
    content = rewrite_links(content, source_path=source_path, resolver=resolver)
    content = normalize_content(content)
    relative = PurePosixPath(source_path)
    parts = relative.parts
    group = "Top Level" if len(parts) <= 2 else parts[1].replace("-", " ")
    return PageInfo(
        source_path=source_path,
        page_name=page_name_for_source(source_path),
        title=extract_title(content, source_path),
        content=content,
        language=language_for_source(source_path),
        group=group,
        is_index=relative.name == "index.md",
    )
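

# Illustrative grouping: "zh/faq.md" lands in "Top Level", while
# "zh/config/providers/start.md" is grouped under "config".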


def read_manifest(wiki_root: Path) -> set[str]:
    manifest_path = wiki_root / MANIFEST_NAME
    if not manifest_path.exists():
        return set()
    return {
        line.strip()
        for line in manifest_path.read_text(encoding="utf-8").splitlines()
        if line.strip()
    }


def write_manifest(wiki_root: Path, file_names: set[str]) -> None:
    manifest_path = wiki_root / MANIFEST_NAME
    content = "\n".join(sorted(file_names))
    if content:
        content = f"{content}\n"
    manifest_path.write_text(content, encoding="utf-8")


def write_file(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")


def sync_docs_to_wiki(source_root: Path, wiki_root: Path) -> None:
    source_root = Path(source_root)
    wiki_root = Path(wiki_root)
    wiki_root.mkdir(parents=True, exist_ok=True)
    resolver = LinkResolver(source_root)
    page_infos = [
        build_page_info(source_root, source_path, resolver)
        for source_path in resolver.source_pages
    ]
    page_names = {info.page_name for info in page_infos}
    # Empty index pages are replaced with a generated table of contents.
    for info in page_infos:
        if info.is_index and not info.content.strip():
            generated = build_language_index(info.language, page_names)
            info.content = generated
            info.title = extract_title(generated, info.source_path)
    desired_files = {f"{info.page_name}.md": info.content for info in page_infos}
    desired_files["Home.md"] = build_home_page("zh")
    desired_files["Home-en.md"] = build_home_page("en")
    desired_files["_Sidebar.md"] = build_sidebar(page_infos)
    # Delete pages managed by a previous run that are no longer desired, then
    # write everything and record the new managed set in the manifest.
    previously_managed = read_manifest(wiki_root)
    for existing_name in previously_managed - set(desired_files):
        existing_path = wiki_root / existing_name
        if existing_path.exists():
            existing_path.unlink()
    for file_name, content in desired_files.items():
        write_file(wiki_root / file_name, content)
    managed_files = set(desired_files)
    write_manifest(wiki_root, managed_files)
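

# Illustrative manifest behavior: if a previous run managed "zh-old-page.md"
# (hypothetical) and that page has since been deleted from the docs, the next
# run removes it from the wiki, because it appears in the manifest but not in
# desired_files.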


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Sync AstrBot docs content to GitHub wiki pages."
    )
    parser.add_argument(
        "--source-root",
        default=str(repo_root()),
        help="Path to the AstrBot-docs repository root.",
    )
    parser.add_argument(
        "--wiki-root",
        help="Path to the checked-out wiki repository.",
    )
    parser.add_argument(
        "--check-links-only",
        action="store_true",
        help="Validate internal doc links without writing wiki files.",
    )
    args = parser.parse_args()
    if not args.check_links_only and not args.wiki_root:
        parser.error("--wiki-root is required unless --check-links-only is set")
    check_unresolved_doc_links(Path(args.source_root))
    if args.check_links_only:
        return 0
    sync_docs_to_wiki(
        source_root=Path(args.source_root), wiki_root=Path(args.wiki_root)
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
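

# Illustrative invocations (the wiki checkout path is an assumption, not part
# of this repo):
#   python scripts/sync_docs_to_wiki.py --check-links-only
#   python scripts/sync_docs_to_wiki.py --wiki-root ../AstrBot.wiki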