feat: add Implementation tab with syntax-highlighted core algorithms
Browse files
app.py
CHANGED
|
@@ -856,7 +856,145 @@ Query Input
|
|
| 856 |
)
|
| 857 |
gr.HTML('</div>')
|
| 858 |
|
| 859 |
-
# ── Tab 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
with gr.Tab("π Citation"):
|
| 861 |
gr.HTML('<div class="section-header">BibTeX</div>')
|
| 862 |
gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)
|
|
|
|
| 856 |
)
|
| 857 |
gr.HTML('</div>')
|
| 858 |
|
| 859 |
+
# ── Tab 4: Implementation ─────────────────
|
| 860 |
+
with gr.Tab("🔬 Implementation"):
|
| 861 |
+
gr.HTML('<div class="section-header">Core Algorithm — Trigger Classifier</div>')
|
| 862 |
+
gr.Code(value='''def classify_trigger(query: str) -> str:
|
| 863 |
+
"""
|
| 864 |
+
Classify a developer query into one of four trigger types.
|
| 865 |
+
Uses regex pattern matching — lightweight, no model required.
|
| 866 |
+
|
| 867 |
+
Returns: EXPLICIT_SYMBOL | SEMANTIC_CONCEPT | TEMPORAL_HISTORY | IMPLICIT_CONTEXT
|
| 868 |
+
"""
|
| 869 |
+
PATTERNS = {
|
| 870 |
+
"EXPLICIT_SYMBOL": [
|
| 871 |
+
r"\\b(function|class|method|def|variable|`[^`]+`)\\b",
|
| 872 |
+
r"\\bwhat does\\s+\\w+\\s+do\\b",
|
| 873 |
+
r"\\b[A-Z][a-zA-Z]+\\.[a-zA-Z]+\\b", # e.g. Pipeline.run
|
| 874 |
+
],
|
| 875 |
+
"IMPLICIT_CONTEXT": [
|
| 876 |
+
r"\\b(import|depend|module|used by|calls|related to)\\b",
|
| 877 |
+
r"\\b(dependency|transitive|downstream|upstream)\\b",
|
| 878 |
+
],
|
| 879 |
+
"TEMPORAL_HISTORY": [
|
| 880 |
+
r"\\b(recent|latest|last|changed|modified|history)\\b",
|
| 881 |
+
],
|
| 882 |
+
"SEMANTIC_CONCEPT": [
|
| 883 |
+
r"\\b(pipeline|architecture|design|flow|how|overview)\\b",
|
| 884 |
+
],
|
| 885 |
+
}
|
| 886 |
+
scores = defaultdict(int)
|
| 887 |
+
q = query.lower()
|
| 888 |
+
for trigger, patterns in PATTERNS.items():
|
| 889 |
+
for pat in patterns:
|
| 890 |
+
if re.search(pat, q):
|
| 891 |
+
scores[trigger] += 1
|
| 892 |
+
return max(scores, key=scores.get) if scores else "SEMANTIC_CONCEPT"''', language="python", label="trigger_classifier.py")
|
| 893 |
+
|
| 894 |
+
gr.HTML('<div class="section-header">Import Graph BFS — Key Differentiator vs RAG</div>')
|
| 895 |
+
gr.Code(value='''def build_import_graph(root_dir: str) -> nx.DiGraph:
|
| 896 |
+
"""
|
| 897 |
+
Parse Python AST to extract import relationships.
|
| 898 |
+
Creates a directed graph: file → imported_file edges.
|
| 899 |
+
O(N) in codebase size — runs in <1s on 1000-file codebases.
|
| 900 |
+
"""
|
| 901 |
+
G = nx.DiGraph()
|
| 902 |
+
for path in Path(root_dir).rglob("*.py"):
|
| 903 |
+
if any(skip in path.parts for skip in ["venv", "__pycache__", ".git"]):
|
| 904 |
+
continue
|
| 905 |
+
try:
|
| 906 |
+
tree = ast.parse(path.read_text(encoding="utf-8", errors="ignore"))
|
| 907 |
+
for node in ast.walk(tree):
|
| 908 |
+
if isinstance(node, ast.Import):
|
| 909 |
+
for alias in node.names:
|
| 910 |
+
dep = alias.name.split(".")[0]
|
| 911 |
+
G.add_edge(str(path), dep)
|
| 912 |
+
elif isinstance(node, ast.ImportFrom) and node.module:
|
| 913 |
+
G.add_edge(str(path), node.module.split(".")[0])
|
| 914 |
+
except SyntaxError:
|
| 915 |
+
pass
|
| 916 |
+
return G
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
def bfs_expand(seed_files: list[str], graph: nx.DiGraph, max_hops: int = 2) -> list[str]:
|
| 920 |
+
"""
|
| 921 |
+
BFS traversal over import graph from seed files.
|
| 922 |
+
Resolves TRANSITIVE dependencies invisible to BM25/embedding methods.
|
| 923 |
+
|
| 924 |
+
Example: query "what does the evaluator use?"
|
| 925 |
+
seed: [evaluator.py]
|
| 926 |
+
hop1: [metrics.py, downstream_quality.py] ← direct imports
|
| 927 |
+
hop2: [scipy, numpy, ...] ← transitive imports
|
| 928 |
+
|
| 929 |
+
This is what gives CTX perfect Recall@5=1.0 on IMPLICIT_CONTEXT queries.
|
| 930 |
+
"""
|
| 931 |
+
visited = set(seed_files)
|
| 932 |
+
frontier = list(seed_files)
|
| 933 |
+
for _ in range(max_hops):
|
| 934 |
+
next_frontier = []
|
| 935 |
+
for f in frontier:
|
| 936 |
+
for neighbor in graph.successors(f):
|
| 937 |
+
if neighbor not in visited:
|
| 938 |
+
visited.add(neighbor)
|
| 939 |
+
next_frontier.append(neighbor)
|
| 940 |
+
frontier = next_frontier
|
| 941 |
+
return list(visited)''', language="python", label="adaptive_trigger.py — BFS Import Graph")
|
| 942 |
+
|
| 943 |
+
gr.HTML('<div class="section-header">TES Metric — Trade-off Efficiency Score</div>')
|
| 944 |
+
gr.Code(value='''def tes(recall_at_k: float, n_retrieved: int) -> float:
|
| 945 |
+
"""
|
| 946 |
+
Trade-off Efficiency Score: balances recall against context size.
|
| 947 |
+
|
| 948 |
+
TES = Recall@K / ln(1 + |retrieved|)
|
| 949 |
+
|
| 950 |
+
Intuition: diminishing returns of loading more files.
|
| 951 |
+
- Loading 1 file: ln(2) = 0.693 penalty
|
| 952 |
+
- Loading 10 files: ln(11) = 2.398 penalty
|
| 953 |
+
- Loading ALL files: ln(1001) = 6.909 penalty ← Full Context collapses here
|
| 954 |
+
|
| 955 |
+
Results:
|
| 956 |
+
Strategy Recall@5 Token% TES
|
| 957 |
+
──────────────── ────────── ────── ─────
|
| 958 |
+
Full Context 0.075 100.0% 0.019 ← bad: high penalty
|
| 959 |
+
BM25 0.982 18.7% 0.410
|
| 960 |
+
CTX (Ours) 0.874 5.2% 0.776 ← best: minimal files
|
| 961 |
+
|
| 962 |
+
Validated: Pearson r=0.87 with NDCG@5 (p<0.001, 28 strategy-dataset pairs)
|
| 963 |
+
"""
|
| 964 |
+
if n_retrieved == 0:
|
| 965 |
+
return 0.0
|
| 966 |
+
return recall_at_k / math.log(1 + n_retrieved)
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def adaptive_k(trigger_type: str, codebase_size: int) -> int:
|
| 970 |
+
"""
|
| 971 |
+
Adaptive retrieval budget based on trigger type.
|
| 972 |
+
Symbol lookups need few files; dependency queries need more hops.
|
| 973 |
+
"""
|
| 974 |
+
base = {
|
| 975 |
+
"EXPLICIT_SYMBOL": 3, # exact match → few files
|
| 976 |
+
"TEMPORAL_HISTORY": 3, # recent changes → few files
|
| 977 |
+
"SEMANTIC_CONCEPT": 5, # concept → moderate
|
| 978 |
+
"IMPLICIT_CONTEXT": 7, # graph traversal → more files
|
| 979 |
+
}
|
| 980 |
+
k = base.get(trigger_type, 5)
|
| 981 |
+
# Raise the budget by 2 (capped at 10) when codebase_size exceeds 500
|
| 982 |
+
if codebase_size > 500:
|
| 983 |
+
k = min(k + 2, 10)
|
| 984 |
+
return k''', language="python", label="metrics.py + adaptive_k")
|
| 985 |
+
|
| 986 |
+
gr.Markdown("""
|
| 987 |
+
---
|
| 988 |
+
**Full source**: [github.com/jaytoone/CTX](https://github.com/jaytoone/CTX)
|
| 989 |
+
|
| 990 |
+
```bash
|
| 991 |
+
git clone https://github.com/jaytoone/CTX && cd CTX
|
| 992 |
+
pip install -r requirements.txt
|
| 993 |
+
python run_experiment.py --dataset-size small --strategy all
|
| 994 |
+
```
|
| 995 |
+
""")
|
| 996 |
+
|
| 997 |
+
# ── Tab 5: Citation ───────────────────────
|
| 998 |
with gr.Tab("π Citation"):
|
| 999 |
gr.HTML('<div class="section-header">BibTeX</div>')
|
| 1000 |
gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)
|