Spaces:

qa1145
/

astrbot_help

Sleeping

File size: 10,065 Bytes

d347708

"""

代码索引 - 提供高性能的代码搜索功能

"""

import re
from pathlib import Path
from typing import Dict, List, Set, Tuple
from dataclasses import dataclass, field
from collections import defaultdict
import time


@dataclass
class IndexEntry:
    """A single index entry: one line of one file, plus a snippet of that line."""
    file_path: str  # path of the file the entry points at
    line: int  # line number within that file
    context: str  # content of the line, kept as display context


class CodeIndex:
    """Code indexer that uses inverted indexes to speed up searches.

    Two in-memory indexes are maintained:

    * a keyword index mapping lower-cased tokens to the lines containing them;
    * a symbol index mapping definition names (functions, classes, variables)
      to the lines that appear to define them.

    Call :meth:`build_index` before searching; each search method returns an
    empty list while its index is empty.
    """

    # Regexes whose group 1 captures the name of a symbol definition.
    # Compiled once at class creation instead of once per indexed line.
    _SYMBOL_PATTERNS = tuple(re.compile(pattern) for pattern in (
        # Python: def foo(), class Bar:, VAR = value
        r'\bdef\s+(\w+)\s*\(',
        r'\bclass\s+(\w+)\s*[:\(]',
        r'^\s*(\w+)\s*=\s*[\'"\w\[]',
        # JavaScript/TypeScript: function foo(), class Bar {}, const x = ...
        r'\bfunction\s+(\w+)\s*\(',
        r'\bclass\s+(\w+)\s*\{',
        r'\bconst\s+(\w+)\s*=',
        r'\blet\s+(\w+)\s*=',
        r'\bvar\s+(\w+)\s*=',
    ))

    # ASCII identifiers (snake_case, camelCase, PascalCase) and plain numbers.
    _TOKEN_RE = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b|\b\d+\b')
    # Inserts a space before a capitalised word (a run of capitals stays whole).
    _CAMEL_WORD_RE = re.compile(r'(.)([A-Z][a-z]+)')
    # Inserts a space at a lower-case/digit -> upper-case boundary.
    _CAMEL_BOUNDARY_RE = re.compile(r'([a-z0-9])([A-Z])')

    _MIN_KEYWORD_LENGTH = 2  # single-character tokens are not indexed
    _CONTEXT_WIDTH = 100  # maximum characters of a line kept as context

    def __init__(self, root_dir: Path):
        """Create an empty index rooted at ``root_dir`` (nothing is scanned yet)."""
        self.root_dir = root_dir
        self._keyword_index: Dict[str, List[IndexEntry]] = defaultdict(list)
        self._symbol_index: Dict[str, List[IndexEntry]] = defaultdict(list)
        self._indexed_files: Set[str] = set()
        self._last_build_time = 0

    @staticmethod
    def _parse_extensions(extensions: str) -> Set[str]:
        """Normalise a comma-separated extension filter.

        Whitespace, a leading ``*`` and leading dots are removed from every
        item, so ``"py"``, ``" .py"`` and ``"*.py"`` are equivalent. Matching
        stays case-sensitive.

        Returns:
            The set of bare extensions to keep. An empty set means "no
            filter" (index every file), which is what ``"*"`` yields.
        """
        wanted = {part.strip().lstrip('*.') for part in extensions.split(',')}
        wanted.discard('')
        return wanted

    @staticmethod
    def _count_entries(index: Dict) -> int:
        """Return the total number of entries stored across all keys of ``index``."""
        return sum(len(entries) for entries in index.values())

    def build_index(self, extensions: str = "*") -> Dict:
        """Build (or rebuild from scratch) the keyword and symbol indexes.

        Args:
            extensions: Comma-separated file extensions to index, e.g.
                ``"py,js"``; ``"*"`` indexes every file.

        Returns:
            Build statistics: ``files_processed``, ``files_skipped``,
            ``keyword_entries``, ``symbol_entries``, ``unique_keywords``,
            ``unique_symbols`` and ``build_time_seconds``.
        """
        import os

        started = time.perf_counter()
        self._keyword_index.clear()
        self._symbol_index.clear()
        self._indexed_files.clear()

        wanted = self._parse_extensions(extensions)

        files_processed = 0
        files_skipped = 0

        # os.walk with followlinks=False (rather than Path.rglob) so that
        # symlinked directories cannot cause cycles.
        for dirpath, _dirnames, filenames in os.walk(self.root_dir, followlinks=False):
            for filename in filenames:
                file_path = Path(dirpath) / filename

                # Symlinked files are skipped as well.
                if file_path.is_symlink():
                    files_skipped += 1
                    continue

                if not file_path.is_file():
                    continue

                if wanted and file_path.suffix.lstrip('.') not in wanted:
                    continue

                try:
                    rel_path = str(file_path.relative_to(self.root_dir))
                    indexed = self._index_file(file_path, rel_path)
                except (ValueError, OSError):
                    # The relative path could not be computed or the file
                    # could not be accessed.
                    indexed = False

                # Only files that were actually indexed count as processed.
                if indexed:
                    files_processed += 1
                else:
                    files_skipped += 1

        build_time = time.perf_counter() - started
        self._last_build_time = time.time()

        return {
            "files_processed": files_processed,
            "files_skipped": files_skipped,
            "keyword_entries": self._count_entries(self._keyword_index),
            "symbol_entries": self._count_entries(self._symbol_index),
            "unique_keywords": len(self._keyword_index),
            "unique_symbols": len(self._symbol_index),
            "build_time_seconds": round(build_time, 2)
        }

    def _index_file(self, file_path: Path, rel_path: str) -> bool:
        """Index a single file.

        Args:
            file_path: Location of the file to read.
            rel_path: Path recorded in the index entries for this file.

        Returns:
            True if the file was indexed; False if it could not be read or
            looks binary (contains a NUL character).
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError:
            # Best effort: an unreadable file is skipped, not fatal.
            return False

        # Binary data would only fill the index with meaningless tokens.
        if '\x00' in content:
            return False

        for line_no, line in enumerate(content.split('\n'), 1):
            context = line.strip()[:self._CONTEXT_WIDTH]
            if not context:
                continue  # a blank line has neither symbols nor keywords

            # One entry per line, shared by every key that refers to it.
            entry = IndexEntry(file_path=rel_path, line=line_no, context=context)

            # Symbol definitions.
            for pattern in self._SYMBOL_PATTERNS:
                for match in pattern.finditer(line):
                    self._symbol_index[match.group(1)].append(entry)

            # Keywords; dict.fromkeys de-duplicates while keeping order, so a
            # word repeated on the line is stored only once for that line.
            for word in dict.fromkeys(self._tokenize(line)):
                if len(word) >= self._MIN_KEYWORD_LENGTH:
                    self._keyword_index[word].append(entry)

        self._indexed_files.add(rel_path)
        return True

    def _tokenize(self, text: str) -> List[str]:
        """Split text into lower-cased keyword tokens.

        ASCII identifiers and numbers are extracted; camelCase/PascalCase
        identifiers are split into their words (``MyFunction`` -> ``my``,
        ``function``), while snake_case identifiers are kept whole.

        Returns:
            The tokens in order of appearance (duplicates included).
        """
        tokens: List[str] = []
        for match in self._TOKEN_RE.finditer(text):
            spaced = self._CAMEL_WORD_RE.sub(r'\1 \2', match.group())
            spaced = self._CAMEL_BOUNDARY_RE.sub(r'\1 \2', spaced)
            tokens.extend(spaced.lower().split())
        return tokens

    def search_keywords(self, keyword: str, max_results: int = 20) -> List[Dict]:
        """Search the keyword index.

        Args:
            keyword: Query text; it is tokenized the same way as indexed lines.
            max_results: Maximum number of results; values <= 0 yield no results.

        Returns:
            Dicts with ``file``, ``line``, ``content`` and ``score``, one per
            matching line, best score first. ``score`` is the number of
            distinct query tokens indexed on that line.
        """
        if max_results <= 0 or not self._keyword_index:
            return []

        # Repeated query words must not be counted twice.
        query_tokens = list(dict.fromkeys(self._tokenize(keyword)))
        if not query_tokens:
            return []

        scores: Dict[Tuple[str, int], int] = {}
        entry_for: Dict[Tuple[str, int], IndexEntry] = {}

        for token in query_tokens:
            counted: Set[Tuple[str, int]] = set()
            # .get() avoids creating empty keys in the defaultdict.
            for entry in self._keyword_index.get(token, ()):
                key = (entry.file_path, entry.line)
                if key in counted:
                    continue  # count each token at most once per line
                counted.add(key)
                scores[key] = scores.get(key, 0) + 1
                entry_for.setdefault(key, entry)

        # sorted() is stable, so ties keep their order of first discovery.
        ranked = sorted(scores, key=scores.get, reverse=True)[:max_results]

        return [
            {
                "file": entry_for[key].file_path,
                "line": entry_for[key].line,
                "content": entry_for[key].context,
                "score": scores[key]
            }
            for key in ranked
        ]

    def search_symbols(self, symbol: str, max_results: int = 20) -> List[Dict]:
        """Search symbol definitions (functions, classes, variables).

        Exact name matches are returned first, followed by case-insensitive
        substring matches.

        Args:
            symbol: Symbol name or part of one.
            max_results: Maximum number of results; values <= 0 yield no results.

        Returns:
            Dicts with ``file``, ``line``, ``content`` and ``type``
            (always ``"definition"``), one per matching line.
        """
        if max_results <= 0 or not self._symbol_index:
            return []

        needle = symbol.lower()

        def candidates():
            # Exact matches first, then fuzzy (substring) matches.
            yield from self._symbol_index.get(symbol, ())
            for name, entries in self._symbol_index.items():
                if needle in name.lower():
                    yield from entries

        results: List[Dict] = []
        seen: Set[Tuple[str, int]] = set()

        for entry in candidates():
            key = (entry.file_path, entry.line)
            if key in seen:
                continue  # report each line only once
            seen.add(key)
            results.append({
                "file": entry.file_path,
                "line": entry.line,
                "content": entry.context,
                "type": "definition"
            })
            if len(results) >= max_results:
                break

        return results

    def get_stats(self) -> Dict:
        """Return statistics about the current index contents."""
        return {
            "indexed_files": len(self._indexed_files),
            "keyword_entries": self._count_entries(self._keyword_index),
            "symbol_entries": self._count_entries(self._symbol_index),
            "unique_keywords": len(self._keyword_index),
            "unique_symbols": len(self._symbol_index),
            "last_build_time": self._last_build_time
        }

    def is_built(self) -> bool:
        """Return True if at least one file has been indexed."""
        return len(self._indexed_files) > 0