Spaces:

Chirag0123
/

codebase-nav-env

Sleeping

App Files Files Community

codebase-nav-env / server /analytics_engine.py

Chirag0123

v4 Research Modules & Pre-submission tweaks

0b0338d 10 days ago

raw

history blame contribute delete

24.3 kB

	# server/analytics_engine.py
	"""
	Unified Analytics Engine — v4.0

	Aggregates ALL scoring dimensions into a single research-grade report.
	Produces:
	- Reasoning graph (structured DAG of the agent's decision process)
	- Root cause analysis (why the agent failed at every level)
	- Decision efficiency score
	- Overall AI reliability profile (radar chart data)
	- Paper-ready JSON suitable for arXiv submission

	This module is the "top of the stack" — it calls all other engines
	and synthesizes their outputs into one authoritative report.
	"""
	from __future__ import annotations
	import time
	import json
	from typing import List, Dict, Any, Optional, Tuple
	from dataclasses import dataclass, field


	@dataclass
	class ReasoningNode:
	    """A single agent action, represented as one vertex of the reasoning graph."""

	    # Identifier of this node inside its graph.
	    node_id: str
	    # Position of the action within the episode trajectory.
	    step_number: int
	    # Kind of action performed (the graph builder compares it against
	    # values such as "read_file", "write_file", "run_tests" and "submit").
	    action_type: str
	    # File path or search query the action operated on, if any.
	    target: Optional[str]
	    # Reward recorded for this step.
	    reward: float
	    # Whether the step was judged to have contributed to the outcome.
	    was_useful: bool
	    # IDs of the nodes this one is linked to. The graph builder in this
	    # module stores the preceding useful node here.
	    connected_to: List[str]
	    # Human-readable description of the step.
	    label: str


	@dataclass
	class ReasoningGraph:
	    """
	    Directed-graph reconstruction of how the agent worked through an episode.

	    Each node is one action the agent took; links between nodes express
	    "built on" relationships (for example a write that followed a read).
	    Nodes are additionally grouped into reasoning phases
	    (Exploration, Hypothesis, Verification, Commit).
	    """

	    # Every action of the episode, in trajectory order.
	    nodes: List[ReasoningNode]
	    # phase name -> IDs of the nodes belonging to that phase.
	    phases: Dict[str, List[str]]
	    # IDs of the nodes on the most impactful path.
	    critical_path: List[str]
	    # IDs of the nodes that contributed nothing.
	    wasted_nodes: List[str]
	    # Textual description of what the agent should have done instead.
	    optimal_path_comparison: Optional[str]

	    def to_dict(self) -> dict:
	        """Return a plain-dict view of the graph, with rewards rounded to 3 places."""
	        serialized_nodes = []
	        for node in self.nodes:
	            serialized_nodes.append(
	                dict(
	                    id=node.node_id,
	                    step=node.step_number,
	                    action=node.action_type,
	                    target=node.target,
	                    reward=round(node.reward, 3),
	                    useful=node.was_useful,
	                    connects_to=node.connected_to,
	                    label=node.label,
	                )
	            )

	        return dict(
	            nodes=serialized_nodes,
	            phases=self.phases,
	            critical_path=self.critical_path,
	            wasted_nodes=self.wasted_nodes,
	            optimal_path=self.optimal_path_comparison,
	        )


	@dataclass
	class AnalyticsReport:
	    """
	    The master analytics report — synthesizes all evaluation dimensions.

	    Holds the per-dimension scores, the reconstructed reasoning graph, the
	    failure root causes and the agent-vs-optimal path comparison for one
	    episode. ``to_dict()`` produces a plain-dict form and ``render_text()``
	    a human-readable one.
	    """

	    report_id: str
	    episode_id: str
	    task: str
	    variant_id: str
	    generated_at: float

	    # Dimension scores (0.0–1.0 each)
	    correctness_score: float  # Did it fix the bug?
	    causal_score: float  # Did it understand WHY?
	    robustness_score: float  # Is the strategy resilient?
	    calibration_score: float  # Was it appropriately confident?
	    reliability_index: float  # Weighted multi-dim score
	    generalization_hint: float  # Based on strategy (robust strategies generalize better)
	    decision_efficiency: float  # Score / Steps ratio (normalized)
	    process_quality: float  # How structured was the reasoning process?

	    # Composite
	    composite_score: float  # Weighted aggregate of all dimensions

	    # Graph
	    reasoning_graph: ReasoningGraph

	    # Root cause trees; render_text() reads the keys
	    # "cause", "effect" and "remediation" from each entry.
	    failure_root_causes: List[Dict]

	    # Alternative path analysis
	    what_agent_did: List[str]
	    what_agent_should_have_done: List[str]
	    steps_wasted: int
	    steps_optimal: int

	    # Profile tags, e.g. ["OVERCONFIDENT", "SHORTCUT_LEARNER", "WELL_CALIBRATED"]
	    profile_tags: List[str]

	    # Executive summary
	    executive_summary: str
	    researcher_notes: str  # More technical deep dive

	    def to_dict(self) -> dict:
	        """Return the report as a plain dict, with every score rounded to 3 places."""
	        return {
	            "report_id": self.report_id,
	            "episode_id": self.episode_id,
	            "task": self.task,
	            "variant_id": self.variant_id,
	            "generated_at": self.generated_at,
	            "dimension_scores": {
	                "correctness": round(self.correctness_score, 3),
	                "causal_reasoning": round(self.causal_score, 3),
	                "robustness": round(self.robustness_score, 3),
	                "calibration": round(self.calibration_score, 3),
	                "reliability_index": round(self.reliability_index, 3),
	                "generalization": round(self.generalization_hint, 3),
	                "decision_efficiency": round(self.decision_efficiency, 3),
	                "process_quality": round(self.process_quality, 3),
	                "composite": round(self.composite_score, 3),
	            },
	            "reasoning_graph": self.reasoning_graph.to_dict(),
	            "failure_root_causes": self.failure_root_causes,
	            "alternative_paths": {
	                "what_agent_did": self.what_agent_did,
	                "optimal_path": self.what_agent_should_have_done,
	                "steps_wasted": self.steps_wasted,
	                "steps_optimal": self.steps_optimal,
	            },
	            "profile_tags": self.profile_tags,
	            "executive_summary": self.executive_summary,
	            "researcher_notes": self.researcher_notes,
	        }

	    def render_text(self) -> str:
	        """
	        Render a human-readable analytics report.

	        Returns a newline-joined string containing the score bars, the
	        profile tags (when present), the executive summary, up to three
	        root causes, the first five agent-vs-optimal steps and the
	        researcher notes (when present).
	        """

	        def bar(v: float, width: int = 20) -> str:
	            # Clamp to [0, 1] so an out-of-range score can neither overflow
	            # the bar nor make it longer than `width`.
	            filled = int(max(0.0, min(1.0, v)) * width)
	            return "█" * filled + "░" * (width - filled)

	        rule = "═" * 70
	        lines = [
	            rule,
	            f" 📈 ANALYTICS ENGINE REPORT — {self.task} | {self.variant_id}",
	            f" Episode: {self.episode_id}",
	            rule,
	            "",
	            "┌─ DIMENSION SCORES ─────────────────────────────────────────────────",
	            f"│ Correctness [{bar(self.correctness_score)}] {self.correctness_score:.3f}",
	            f"│ Causal Reasoning [{bar(self.causal_score)}] {self.causal_score:.3f}",
	            f"│ Robustness [{bar(self.robustness_score)}] {self.robustness_score:.3f}",
	            f"│ Calibration [{bar(self.calibration_score)}] {self.calibration_score:.3f}",
	            f"│ Reliability [{bar(self.reliability_index)}] {self.reliability_index:.3f}",
	            f"│ Decision Effic. [{bar(self.decision_efficiency)}] {self.decision_efficiency:.3f}",
	            f"│ Process Quality [{bar(self.process_quality)}] {self.process_quality:.3f}",
	            f"│ {'─'*60}",
	            f"│ COMPOSITE [{bar(self.composite_score)}] {self.composite_score:.3f}",
	            "└────────────────────────────────────────────────────────────────────",
	            "",
	        ]

	        if self.profile_tags:
	            # Join outside the f-string: a backslash inside an f-string
	            # expression is a syntax error before Python 3.12.
	            joined_tags = " | ".join(self.profile_tags)
	            lines.append(f"🏷️ Profile: {joined_tags}")
	            lines.append("")

	        lines += [
	            "📝 Executive Summary",
	            f" {self.executive_summary}",
	            "",
	        ]

	        if self.failure_root_causes:
	            lines.append("🔥 Failure Root Cause Analysis")
	            for rc in self.failure_root_causes[:3]:
	                lines.append(f" Cause: {rc.get('cause')}")
	                lines.append(f" Effect: {rc.get('effect')}")
	                lines.append(f" Fix: {rc.get('remediation')}")
	                lines.append("")

	        lines += [
	            "🗺️ What Agent Did vs Optimal",
	            f" Steps taken: {len(self.what_agent_did)} | Steps optimal: {self.steps_optimal} | Wasted: {self.steps_wasted}",
	        ]
	        for did, best in zip(
	            self.what_agent_did[:5],
	            self.what_agent_should_have_done[:5],
	        ):
	            # Mark whether the agent's step matched the suggested one.
	            marker = " ✓" if did == best else " ✗"
	            lines.append(f"{marker} Agent: {did}")
	            lines.append(f" Optimal: {best}")
	        lines.append("")

	        if self.researcher_notes:
	            lines += ["🔬 Researcher Notes", f" {self.researcher_notes}", ""]

	        lines.append(rule)
	        return "\n".join(lines)


	class AnalyticsEngine:
	    """
	    Master analytics engine — integrates all evaluation modules.

	    Call .analyze() after an episode to get the full AnalyticsReport.
	    """

	    # Maps an action type to the reasoning phase it is filed under;
	    # anything not listed here is treated as "Commit".
	    _PHASE_BY_ACTION = {
	        "read_file": "Exploration",
	        "search_code": "Exploration",
	        "write_file": "Hypothesis",
	        "run_tests": "Verification",
	    }

	    def analyze(
	        self,
	        env,
	        causal_report=None,
	        counterfactual_report=None,
	        calibration_report=None,
	        advanced_metrics=None,
	        failure_report=None,
	        strategy_report=None,
	    ) -> AnalyticsReport:
	        """
	        Synthesize all evaluation outputs into one AnalyticsReport.

	        Args:
	            env: Episode environment. This method reads ``get_trajectory()``,
	                ``variant`` (and its ``meta``), ``current_task``,
	                ``final_score``, ``files_read`` and ``files_written`` from it.
	            causal_report, counterfactual_report, calibration_report,
	            advanced_metrics, failure_report, strategy_report: Pre-computed
	                sub-reports. Any that is None is computed here by the
	                matching ``server.*`` engine, imported lazily.

	        Returns:
	            The assembled AnalyticsReport.
	        """
	        import uuid

	        traj = env.get_trajectory()
	        steps = traj.get("steps", []) if traj else []
	        meta = env.variant.meta if env.variant else {}
	        episode_id = traj.get("episode_id", "unknown") if traj else "unknown"
	        variant_id = traj.get("variant_id", "unknown") if traj else "unknown"
	        task = env.current_task or "unknown"
	        final_score = env.final_score
	        files_read = list(env.files_read)
	        files_written = list(env.files_written)

	        # ── Run sub-engines if reports not provided ────────────────────────
	        if causal_report is None:
	            from server.causal_probe import CausalProbe
	            causal_report = CausalProbe().probe(
	                episode_id, task, steps, meta, files_read, files_written, final_score
	            )
	        if counterfactual_report is None:
	            from server.counterfactual_engine import CounterfactualEngine
	            counterfactual_report = CounterfactualEngine().analyze(
	                episode_id, task, steps, meta, files_read, files_written, final_score
	            )
	        if calibration_report is None:
	            from server.confidence_calibrator import ConfidenceCalibrator
	            calibration_report = ConfidenceCalibrator().calibrate(
	                episode_id, task, steps, final_score
	            )
	        if advanced_metrics is None:
	            from server.advanced_metrics import AdvancedMetricsEngine
	            advanced_metrics = AdvancedMetricsEngine().compute(
	                steps, meta, final_score, files_read, files_written
	            )
	        if failure_report is None:
	            from server.failure_classifier import FailureClassifier
	            failure_report = FailureClassifier().classify(
	                episode_id, task, steps, meta, files_read, files_written, final_score
	            )
	        if strategy_report is None:
	            from server.strategy_detector import StrategyDetector
	            strategy_report = StrategyDetector().detect(
	                steps, task, meta, files_read, final_score
	            )

	        # ── Compute derived scores ─────────────────────────────────────────
	        causal_score = causal_report.causal_score
	        robustness_score = counterfactual_report.robustness_score
	        calibration_score = calibration_report.calibration_score
	        reliability_index = advanced_metrics.reliability_index
	        correctness_score = final_score

	        # Decision efficiency: the score is left untouched while the step
	        # count stays within a third of the step budget, and is divided
	        # down proportionally beyond that. Capped at 1.0.
	        total_steps = max(len(steps), 1)
	        max_steps_possible = meta.get("max_steps", 20)
	        decision_efficiency = (
	            final_score /
	            max(1.0, total_steps / max(1, max_steps_possible / 3))
	        )
	        decision_efficiency = min(1.0, decision_efficiency)

	        # Process quality: measures structural quality of reasoning process.
	        # The four weights add up to 1.0.
	        read_before_write = causal_report.read_before_write
	        tested_before_submit = causal_report.submit_after_test
	        used_search = causal_report.search_before_navigate
	        full_chain = causal_report.actual_chain_coverage
	        process_quality = (
	            (0.25 if read_before_write else 0.0) +
	            (0.25 if tested_before_submit else 0.0) +
	            (0.20 if used_search else 0.0) +
	            full_chain * 0.30
	        )

	        # Generalization hint: per-strategy prior (0.5 for unknown
	        # strategies) averaged with the measured robustness.
	        strategy_generalization_map = {
	            "TARGETED_DEBUGGING": 0.75,
	            "SYSTEMATIC_SEARCH": 0.70,
	            "SPEC_DRIVEN": 0.80,
	            "BRUTE_FORCE": 0.40,
	            "RANDOM_EXPLORATION": 0.30,
	            "MINIMAL_EFFORT": 0.20,
	        }
	        generalization_hint = strategy_generalization_map.get(strategy_report.strategy, 0.5)
	        generalization_hint = (generalization_hint + robustness_score) / 2

	        # Composite (weighted aggregate; the weights add up to 1.0)
	        composite_score = (
	            correctness_score * 0.30 +
	            causal_score * 0.20 +
	            robustness_score * 0.15 +
	            calibration_score * 0.12 +
	            reliability_index * 0.10 +
	            process_quality * 0.08 +
	            decision_efficiency * 0.05
	        )

	        # ── Build reasoning graph ──────────────────────────────────────────
	        reasoning_graph = self._build_reasoning_graph(steps, meta, files_read, files_written)

	        # ── Root cause analysis ────────────────────────────────────────────
	        root_causes = self._build_root_cause_tree(
	            failure_report, causal_report, calibration_report, final_score
	        )

	        # ── Alternative path analysis ──────────────────────────────────────
	        what_did = [
	            f"{s.get('action_type')} {s.get('action_path') or s.get('action_query') or ''}".strip()
	            for s in steps
	        ]
	        optimal = self._compute_optimal_path(meta, files_read, files_written, final_score)
	        steps_wasted = max(0, total_steps - len(optimal))

	        # ── Profile tags ───────────────────────────────────────────────────
	        tags = []
	        if calibration_report.profile.value != "WELL_CALIBRATED":
	            tags.append(calibration_report.profile.value)
	        if causal_report.shortcut_learning_detected:
	            tags.append("SHORTCUT_LEARNER")
	        if causal_report.false_confidence_detected:
	            tags.append("FALSE_CONFIDENCE")
	        if counterfactual_report.brittleness_level.value in ("BRITTLE", "FRAGILE"):
	            tags.append(f"BRITTLE_STRATEGY_{counterfactual_report.brittleness_level.value}")
	        if causal_report.understanding_level.value == "DEEP":
	            tags.append("DEEP_REASONER")
	        if strategy_report.strategy == "TARGETED_DEBUGGING":
	            tags.append("TARGETED_DEBUGGER")
	        if not tags:
	            tags.append("TYPICAL")

	        # ── Executive summary ──────────────────────────────────────────────
	        summary_parts = [
	            f"Agent scored {final_score:.2f} on {task}.",
	            f"Causal understanding: {causal_report.understanding_level.value} ({causal_score:.2f}).",
	            f"Strategy: {strategy_report.strategy} (robustness: {robustness_score:.2f}).",
	            f"Confidence calibration: {calibration_report.profile.value} (error: {calibration_report.expected_calibration_error:.2f}).",
	            f"Composite reliability: {composite_score:.2f}.",
	        ]
	        executive_summary = " ".join(summary_parts)

	        # ── Researcher notes ───────────────────────────────────────────────
	        researcher_notes = (
	            f"Observed {total_steps} steps ({steps_wasted} wasted vs estimated {len(optimal)} optimal). "
	            f"Chain coverage: {causal_report.actual_chain_coverage:.0%}. "
	            f"Chain order score: {causal_report.chain_order_score:.2f}. "
	            f"Counterfactual mutations survived: {counterfactual_report.mutations_survived}/{len(counterfactual_report.mutations_tested)}. "
	            f"Expected calibration error: {calibration_report.expected_calibration_error:.3f}. "
	            f"Decision efficiency: {decision_efficiency:.3f}. "
	            f"Process quality: {process_quality:.3f}."
	        )

	        return AnalyticsReport(
	            report_id=f"ar_{uuid.uuid4().hex[:10]}",
	            episode_id=episode_id,
	            task=task,
	            variant_id=variant_id,
	            generated_at=time.time(),
	            correctness_score=correctness_score,
	            causal_score=causal_score,
	            robustness_score=robustness_score,
	            calibration_score=calibration_score,
	            reliability_index=reliability_index,
	            generalization_hint=generalization_hint,
	            decision_efficiency=decision_efficiency,
	            process_quality=process_quality,
	            composite_score=composite_score,
	            reasoning_graph=reasoning_graph,
	            failure_root_causes=root_causes,
	            what_agent_did=what_did,
	            what_agent_should_have_done=optimal,
	            steps_wasted=steps_wasted,
	            steps_optimal=len(optimal),
	            profile_tags=tags,
	            executive_summary=executive_summary,
	            researcher_notes=researcher_notes,
	        )

	    def _build_reasoning_graph(
	        self,
	        steps: List[dict],
	        meta: dict,
	        files_read: List[str],
	        files_written: List[str],
	    ) -> ReasoningGraph:
	        """
	        Build the reasoning graph from the trajectory steps.

	        Args:
	            steps: Trajectory step dicts; the keys read are ``step_number``,
	                ``action_type``, ``action_path``, ``action_query`` and
	                ``reward``.
	            meta: Variant metadata; ``bug_files`` and ``files_to_implement``
	                are read from it.
	            files_read: Files the agent read, used for the optimal-path text.
	            files_written: Accepted for signature symmetry; not used here.

	        Returns:
	            A ReasoningGraph holding one node per step. Phases that ended
	            up with no nodes are omitted.
	        """
	        # Keep the declared order in a list so the optimal-path text is
	        # deterministic; the set is only used for membership tests.
	        declared_bug_files = (
	            list(meta.get("bug_files") or [])
	            + list(meta.get("files_to_implement") or [])
	        )
	        bug_files = set(declared_bug_files)

	        nodes: List[ReasoningNode] = []
	        phases: Dict[str, List[str]] = {
	            "Exploration": [], "Hypothesis": [], "Verification": [], "Commit": []
	        }
	        last_useful_node_id: Optional[str] = None

	        for s in steps:
	            step_number = s.get("step_number", len(nodes) + 1)
	            node_id = f"n{step_number}"
	            atype = s.get("action_type", "unknown")
	            target = s.get("action_path") or s.get("action_query")
	            # A missing or None reward counts as zero.
	            reward = s.get("reward") or 0.0

	            # A step is useful when it earned reward, read a known bug
	            # file, searched the code, or ran the tests.
	            was_useful = (
	                reward > 0
	                or (atype == "read_file" and target in bug_files)
	                or atype in ("search_code", "run_tests")
	            )

	            phase = self._PHASE_BY_ACTION.get(atype, "Commit")

	            # Label: action name plus the target's last path component
	            # (at most 20 characters).
	            short_target = target.split("/")[-1][:20] if target else ""
	            label = f"{atype}({short_target})" if short_target else atype

	            # A useful node links back to the previous useful node.
	            connects_to = (
	                [last_useful_node_id] if last_useful_node_id and was_useful else []
	            )

	            nodes.append(
	                ReasoningNode(
	                    node_id=node_id,
	                    step_number=step_number,
	                    action_type=atype,
	                    target=target,
	                    reward=reward,
	                    was_useful=was_useful,
	                    connected_to=connects_to,
	                    label=label,
	                )
	            )
	            phases[phase].append(node_id)
	            if was_useful:
	                last_useful_node_id = node_id

	        # Critical path: nodes with positive reward, plus every submit.
	        critical_path = [
	            n.node_id for n in nodes if n.reward > 0 or n.action_type == "submit"
	        ]
	        wasted_nodes = [
	            n.node_id for n in nodes if not n.was_useful and n.action_type != "submit"
	        ]

	        return ReasoningGraph(
	            nodes=nodes,
	            phases={k: v for k, v in phases.items() if v},
	            critical_path=critical_path,
	            wasted_nodes=wasted_nodes,
	            optimal_path_comparison=self._optimal_path_summary(
	                files_read, declared_bug_files
	            ),
	        )

	    @staticmethod
	    def _optimal_path_summary(files_read: List[str], bug_files: List[str]) -> str:
	        """Return the suggested action sequence as one " → "-joined string.

	        Candidate files are ``files_read`` followed by ``bug_files``,
	        de-duplicated in first-seen order so the same file is never
	        suggested twice. The result reads at most one file whose path
	        contains "test" and at most two other files, then writes, runs the
	        tests and submits.
	        """
	        candidates = list(dict.fromkeys([*files_read, *bug_files]))
	        test_files = [f for f in candidates if "test" in f.lower()]
	        src_files = [f for f in candidates if "test" not in f.lower()]

	        actions = [
	            f"read_file({f.split('/')[-1]})"
	            for f in test_files[:1] + src_files[:2]
	        ]
	        actions += ["write_file(src)", "run_tests", "submit"]
	        return " → ".join(actions)

	    def _build_root_cause_tree(
	        self, failure_report, causal_report, calibration_report, final_score: float
	    ) -> List[Dict]:
	        """
	        Build a flat list of root-cause entries.

	        Each entry is a dict with the keys ``depth``, ``cause``, ``effect``
	        and ``remediation``. Primary entries come from the failure report
	        (up to three), or from a low final score (< 0.5) when the report
	        lists no failures. Secondary entries come from up to two causal
	        guessing indicators and from an OVERCONFIDENT calibration profile.
	        Any of the three reports may be None.
	        """
	        causes = []

	        if failure_report and failure_report.failures:
	            for f in failure_report.failures[:3]:
	                causes.append({
	                    "depth": "primary",
	                    "cause": f.failure_type if hasattr(f, "failure_type") else str(f),
	                    "effect": f.evidence if hasattr(f, "evidence") else "unknown",
	                    "remediation": f.remediation if hasattr(f, "remediation") else "See improvement plan",
	                })
	        elif final_score < 0.5:
	            causes.append({
	                "depth": "primary",
	                "cause": failure_report.primary_failure if failure_report else "LOW_SCORE",
	                "effect": f"Final score only {final_score:.2f} — bug not adequately fixed",
	                "remediation": "Use test-first navigation and verify with run_tests",
	            })

	        if causal_report and causal_report.guessing_indicators:
	            # Every causal-gap entry shares the first recommendation, if any.
	            first_recommendation = (
	                causal_report.recommendations[0] if causal_report.recommendations else ""
	            )
	            for ind in causal_report.guessing_indicators[:2]:
	                causes.append({
	                    "depth": "secondary",
	                    "cause": "CAUSAL_GAP",
	                    "effect": ind,
	                    "remediation": first_recommendation,
	                })

	        if calibration_report and calibration_report.profile.value == "OVERCONFIDENT":
	            causes.append({
	                "depth": "secondary",
	                "cause": "OVERCONFIDENCE",
	                "effect": f"Inferred confidence {calibration_report.inferred_confidence:.2f} vs actual {calibration_report.actual_performance:.2f}",
	                "remediation": "Read more before committing. Verify with tests.",
	            })

	        return causes

	    def _compute_optimal_path(
	        self, meta: dict, files_read: List[str], files_written: List[str], score: float
	    ) -> List[str]:
	        """
	        Suggest what the optimal action sequence would have been.

	        Reads one test file (the first entry of ``files_read`` whose path
	        contains "test", else "tests/test_main.py"), then up to two bug
	        files (``meta["bug_files"]``, else ``meta["files_to_implement"]``,
	        else "src/main.py"), followed by a search, a write, a test run and
	        a submit. ``files_written`` and ``score`` are not used.
	        """
	        test_files = [f for f in files_read if "test" in f.lower()]
	        bug_files = meta.get("bug_files", []) or meta.get("files_to_implement", [])

	        path = [f"read_file {tf}" for tf in (test_files or ["tests/test_main.py"])[:1]]
	        path += [f"read_file {bf}" for bf in (bug_files or ["src/main.py"])[:2]]
	        path += [
	            "search_code <function_name>",
	            "write_file <targeted_fix>",
	            "run_tests",
	            "submit",
	        ]
	        return path