Be2Jay committed on
Commit
31190eb
·
verified ·
1 Parent(s): 2bee1bf

feat: add Implementation tab with syntax-highlighted core algorithms

Browse files
Files changed (1) hide show
  1. app.py +139 -1
app.py CHANGED
@@ -856,7 +856,145 @@ Query Input
856
  )
857
  gr.HTML('</div>')
858
 
859
- # ── Tab 4: Citation ───────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
860
  with gr.Tab("📚 Citation"):
861
  gr.HTML('<div class="section-header">BibTeX</div>')
862
  gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)
 
856
  )
857
  gr.HTML('</div>')
858
 
859
+ # ── Tab 4: Implementation ─────────────────
860
+ with gr.Tab("🔬 Implementation"):
861
+ gr.HTML('<div class="section-header">Core Algorithm — Trigger Classifier</div>')
862
+ gr.Code(value='''def classify_trigger(query: str) -> str:
863
+ """
864
+ Classify a developer query into one of four trigger types.
865
+ Uses regex pattern matching — lightweight, no model required.
866
+
867
+ Returns: EXPLICIT_SYMBOL | SEMANTIC_CONCEPT | TEMPORAL_HISTORY | IMPLICIT_CONTEXT
868
+ """
869
+ PATTERNS = {
870
+ "EXPLICIT_SYMBOL": [
871
+ r"\\b(function|class|method|def|variable|`[^`]+`)\\b",
872
+ r"\\bwhat does\\s+\\w+\\s+do\\b",
873
+ r"\\b[A-Z][a-zA-Z]+\\.[a-zA-Z]+\\b", # e.g. Pipeline.run
874
+ ],
875
+ "IMPLICIT_CONTEXT": [
876
+ r"\\b(import|depend|module|used by|calls|related to)\\b",
877
+ r"\\b(dependency|transitive|downstream|upstream)\\b",
878
+ ],
879
+ "TEMPORAL_HISTORY": [
880
+ r"\\b(recent|latest|last|changed|modified|history)\\b",
881
+ ],
882
+ "SEMANTIC_CONCEPT": [
883
+ r"\\b(pipeline|architecture|design|flow|how|overview)\\b",
884
+ ],
885
+ }
886
+ scores = defaultdict(int)
887
+ q = query.lower()
888
+ for trigger, patterns in PATTERNS.items():
889
+ for pat in patterns:
890
+ if re.search(pat, q):
891
+ scores[trigger] += 1
892
+ return max(scores, key=scores.get) if scores else "SEMANTIC_CONCEPT"''', language="python", label="trigger_classifier.py")
893
+
894
+ gr.HTML('<div class="section-header">Import Graph BFS — Key Differentiator vs RAG</div>')
895
+ gr.Code(value='''def build_import_graph(root_dir: str) -> nx.DiGraph:
896
+ """
897
+ Parse Python AST to extract import relationships.
898
+ Creates a directed graph: file → imported_file edges.
899
+ O(N) in codebase size — runs in <1s on 1000-file codebases.
900
+ """
901
+ G = nx.DiGraph()
902
+ for path in Path(root_dir).rglob("*.py"):
903
+ if any(skip in path.parts for skip in ["venv", "__pycache__", ".git"]):
904
+ continue
905
+ try:
906
+ tree = ast.parse(path.read_text(encoding="utf-8", errors="ignore"))
907
+ for node in ast.walk(tree):
908
+ if isinstance(node, ast.Import):
909
+ for alias in node.names:
910
+ dep = alias.name.split(".")[0]
911
+ G.add_edge(str(path), dep)
912
+ elif isinstance(node, ast.ImportFrom) and node.module:
913
+ G.add_edge(str(path), node.module.split(".")[0])
914
+ except SyntaxError:
915
+ pass
916
+ return G
917
+
918
+
919
+ def bfs_expand(seed_files: list[str], graph: nx.DiGraph, max_hops: int = 2) -> list[str]:
920
+ """
921
+ BFS traversal over import graph from seed files.
922
+ Resolves TRANSITIVE dependencies invisible to BM25/embedding methods.
923
+
924
+ Example: query "what does the evaluator use?"
925
+ seed: [evaluator.py]
926
+ hop1: [metrics.py, downstream_quality.py] ← direct imports
927
+ hop2: [scipy, numpy, ...] ← transitive imports
928
+
929
+ This is what gives CTX perfect Recall@5=1.0 on IMPLICIT_CONTEXT queries.
930
+ """
931
+ visited = set(seed_files)
932
+ frontier = list(seed_files)
933
+ for _ in range(max_hops):
934
+ next_frontier = []
935
+ for f in frontier:
936
+ for neighbor in graph.successors(f):
937
+ if neighbor not in visited:
938
+ visited.add(neighbor)
939
+ next_frontier.append(neighbor)
940
+ frontier = next_frontier
941
+ return list(visited)''', language="python", label="adaptive_trigger.py — BFS Import Graph")
942
+
943
+ gr.HTML('<div class="section-header">TES Metric — Trade-off Efficiency Score</div>')
944
+ gr.Code(value='''def tes(recall_at_k: float, n_retrieved: int) -> float:
945
+ """
946
+ Trade-off Efficiency Score: balances recall against context size.
947
+
948
+ TES = Recall@K / ln(1 + |retrieved|)
949
+
950
+ Intuition: diminishing returns of loading more files.
951
+ - Loading 1 file: ln(2) = 0.693 penalty
952
+ - Loading 10 files: ln(11) = 2.398 penalty
953
+ - Loading ALL files: ln(1001) = 6.909 penalty ← Full Context collapses here
954
+
955
+ Results:
956
+ Strategy Recall@5 Token% TES
957
+ ──────────────── ────────── ────── ─────
958
+ Full Context 0.075 100.0% 0.019 ← bad: high penalty
959
+ BM25 0.982 18.7% 0.410
960
+ CTX (Ours) 0.874 5.2% 0.776 ← best: minimal files
961
+
962
+ Validated: Pearson r=0.87 with NDCG@5 (p<0.001, 28 strategy-dataset pairs)
963
+ """
964
+ if n_retrieved == 0:
965
+ return 0.0
966
+ return recall_at_k / math.log(1 + n_retrieved)
967
+
968
+
969
+ def adaptive_k(trigger_type: str, codebase_size: int) -> int:
970
+ """
971
+ Adaptive retrieval budget based on trigger type.
972
+ Symbol lookups need few files; dependency queries need more hops.
973
+ """
974
+ base = {
975
+ "EXPLICIT_SYMBOL": 3, # exact match → few files
976
+ "TEMPORAL_HISTORY": 3, # recent changes → few files
977
+ "SEMANTIC_CONCEPT": 5, # concept → moderate
978
+ "IMPLICIT_CONTEXT": 7, # graph traversal → more files
979
+ }
980
+ k = base.get(trigger_type, 5)
981
+ # Widen the budget for large codebases: +2 files above 500 files, capped at 10
982
+ if codebase_size > 500:
983
+ k = min(k + 2, 10)
984
+ return k''', language="python", label="metrics.py + adaptive_k")
985
+
986
+ gr.Markdown("""
987
+ ---
988
+ **Full source**: [github.com/jaytoone/CTX](https://github.com/jaytoone/CTX)
989
+
990
+ ```bash
991
+ git clone https://github.com/jaytoone/CTX && cd CTX
992
+ pip install -r requirements.txt
993
+ python run_experiment.py --dataset-size small --strategy all
994
+ ```
995
+ """)
996
+
997
+ # ── Tab 5: Citation ───────────────────────
998
  with gr.Tab("📚 Citation"):
999
  gr.HTML('<div class="section-header">BibTeX</div>')
1000
  gr.Textbox(value=BIBTEX, label="BibTeX", lines=10, max_lines=15)