Spaces:

Be2Jay
/

CTX

Running

App Files Community

Be2Jay committed 27 days ago

Commit

31190eb

verified ·

1 Parent(s): 2bee1bf

feat: add Implementation tab with syntax-highlighted core algorithms

Browse files

Files changed (1) hide show

app.py +139 -1

app.py CHANGED Viewed

@@ -856,7 +856,145 @@ Query Input
             )
             gr.HTML('</div>')
-        # ── Tab 4: Citation ───────────────────────
         with gr.Tab("📚 Citation"):
             gr.HTML('<div class="section-header">BibTeX</div>')
             gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)

             )
             gr.HTML('</div>')
+        # ── Tab 4: Implementation ─────────────────
+        with gr.Tab("🔬 Implementation"):
+            gr.HTML('<div class="section-header">Core Algorithm — Trigger Classifier</div>')
+            gr.Code(value='''def classify_trigger(query: str) -> str:
+    """
+    Classify a developer query into one of four trigger types.
+    Uses regex pattern matching — lightweight, no model required.
+    Returns: EXPLICIT_SYMBOL | SEMANTIC_CONCEPT | TEMPORAL_HISTORY | IMPLICIT_CONTEXT
+    """
+    PATTERNS = {
+        "EXPLICIT_SYMBOL": [
+            r"\\b(function|class|method|def|variable|`[^`]+`)\\b",
+            r"\\bwhat does\\s+\\w+\\s+do\\b",
+            r"\\b[A-Z][a-zA-Z]+\\.[a-zA-Z]+\\b",       # e.g. Pipeline.run
+        ],
+        "IMPLICIT_CONTEXT": [
+            r"\\b(import|depend|module|used by|calls|related to)\\b",
+            r"\\b(dependency|transitive|downstream|upstream)\\b",
+        ],
+        "TEMPORAL_HISTORY": [
+            r"\\b(recent|latest|last|changed|modified|history)\\b",
+        ],
+        "SEMANTIC_CONCEPT": [
+            r"\\b(pipeline|architecture|design|flow|how|overview)\\b",
+        ],
+    }
+    scores = defaultdict(int)
+    q = query.lower()
+    for trigger, patterns in PATTERNS.items():
+        for pat in patterns:
+            if re.search(pat, q):
+                scores[trigger] += 1
+    return max(scores, key=scores.get) if scores else "SEMANTIC_CONCEPT"''', language="python", label="trigger_classifier.py")
+            gr.HTML('<div class="section-header">Import Graph BFS — Key Differentiator vs RAG</div>')
+            gr.Code(value='''def build_import_graph(root_dir: str) -> nx.DiGraph:
+    """
+    Parse Python AST to extract import relationships.
+    Creates a directed graph: file → imported_file edges.
+    O(N) in codebase size — runs in <1s on 1000-file codebases.
+    """
+    G = nx.DiGraph()
+    for path in Path(root_dir).rglob("*.py"):
+        if any(skip in path.parts for skip in ["venv", "__pycache__", ".git"]):
+            continue
+        try:
+            tree = ast.parse(path.read_text(encoding="utf-8", errors="ignore"))
+            for node in ast.walk(tree):
+                if isinstance(node, ast.Import):
+                    for alias in node.names:
+                        dep = alias.name.split(".")[0]
+                        G.add_edge(str(path), dep)
+                elif isinstance(node, ast.ImportFrom) and node.module:
+                    G.add_edge(str(path), node.module.split(".")[0])
+        except SyntaxError:
+            pass
+    return G
+def bfs_expand(seed_files: list[str], graph: nx.DiGraph, max_hops: int = 2) -> list[str]:
+    """
+    BFS traversal over import graph from seed files.
+    Resolves TRANSITIVE dependencies invisible to BM25/embedding methods.
+    Example: query "what does the evaluator use?"
+      seed: [evaluator.py]
+      hop1: [metrics.py, downstream_quality.py]    ← direct imports
+      hop2: [scipy, numpy, ...]                     ← transitive imports
+    This is what gives CTX perfect Recall@5=1.0 on IMPLICIT_CONTEXT queries.
+    """
+    visited = set(seed_files)
+    frontier = list(seed_files)
+    for _ in range(max_hops):
+        next_frontier = []
+        for f in frontier:
+            for neighbor in graph.successors(f):
+                if neighbor not in visited:
+                    visited.add(neighbor)
+                    next_frontier.append(neighbor)
+        frontier = next_frontier
+    return list(visited)''', language="python", label="adaptive_trigger.py — BFS Import Graph")
+            gr.HTML('<div class="section-header">TES Metric — Trade-off Efficiency Score</div>')
+            gr.Code(value='''def tes(recall_at_k: float, n_retrieved: int) -> float:
+    """
+    Trade-off Efficiency Score: balances recall against context size.
+    TES = Recall@K / ln(1 + |retrieved|)
+    Intuition: diminishing returns of loading more files.
+    - Loading 1 file: ln(2) = 0.693 penalty
+    - Loading 10 files: ln(11) = 2.398 penalty
+    - Loading ALL files (e.g. 1000): ln(1001) = 6.909 penalty  ← Full Context collapses here
+    Results:
+        Strategy         Recall@5    Token%    TES
+        ──────────────── ──────────  ──────    ─────
+        Full Context       0.075     100.0%    0.019  ← bad: high penalty
+        BM25               0.982      18.7%    0.410
+        CTX (Ours)         0.874       5.2%    0.776  ← best: minimal files
+    Validated: Pearson r=0.87 with NDCG@5 (p<0.001, 28 strategy-dataset pairs)
+    """
+    if n_retrieved == 0:
+        return 0.0
+    return recall_at_k / math.log(1 + n_retrieved)
+def adaptive_k(trigger_type: str, codebase_size: int) -> int:
+    """
+    Adaptive retrieval budget based on trigger type.
+    Symbol lookups need few files; dependency queries need more files.
+    """
+    base = {
+        "EXPLICIT_SYMBOL":  3,   # exact match → few files
+        "TEMPORAL_HISTORY": 3,   # recent changes → few files
+        "SEMANTIC_CONCEPT": 5,   # concept → moderate
+        "IMPLICIT_CONTEXT": 7,   # graph traversal → more files
+    }
+    k = base.get(trigger_type, 5)
+    # Large codebases (>500 files) get 2 extra files, capped at 10 (a single step, not a log scale)
+    if codebase_size > 500:
+        k = min(k + 2, 10)
+    return k''', language="python", label="metrics.py + adaptive_k")
+            gr.Markdown("""
+---
+**Full source**: [github.com/jaytoone/CTX](https://github.com/jaytoone/CTX)
+```bash
+git clone https://github.com/jaytoone/CTX && cd CTX
+pip install -r requirements.txt
+python run_experiment.py --dataset-size small --strategy all
+```
+""")
+        # ── Tab 5: Citation ───────────────────────
         with gr.Tab("📚 Citation"):
             gr.HTML('<div class="section-header">BibTeX</div>')
             gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)