Spaces:
Paused
Paused
| from __future__ import annotations | |
| import math | |
| from dataclasses import dataclass, field | |
@dataclass
class EvalMetrics:
    """Accumulate per-episode evaluation results and derive summary metrics.

    Call :meth:`add` once per finished episode, then :meth:`summary` to get
    the averaged metrics and the weighted ``leaderboard_score``.
    """

    # Episode-level counters.
    episodes: int = 0
    success: int = 0
    total_steps: int = 0
    total_tool_calls: int = 0
    total_redundant_tool_calls: int = 0
    total_reward: float = 0.0
    # Counted only for episodes whose task_type is "identity_resolution".
    deanonymization_total: int = 0
    deanonymization_success: int = 0
    # One entry per episode, in insertion order.
    graph_f1_scores: list[float] = field(default_factory=list)
    # Running sums of the entries found in info["reward_components"].
    total_knowledge_carrier: float = 0.0
    total_knowledge_indexing: float = 0.0
    total_connectivity: float = 0.0
    total_format_reward: float = 0.0
    total_relation_informativeness: float = 0.0
    total_entity_informativeness: float = 0.0
    total_diversity: float = 0.0
    total_soft_shaping: float = 0.0
    total_connectivity_gain: float = 0.0
    total_compactness: float = 0.0
    # Running sums of the spawn-related entries found in info.
    total_spawn_count: int = 0
    total_spawn_finished_subtasks: int = 0
    total_spawn_critical_steps: int = 0

    # Keys read from info["reward_components"]; key K accumulates into the
    # field named total_K. Left un-annotated on purpose so that the dataclass
    # machinery treats it as a plain class attribute, not an instance field.
    _REWARD_COMPONENT_KEYS = (
        "knowledge_carrier",
        "knowledge_indexing",
        "connectivity",
        "format_reward",
        "relation_informativeness",
        "entity_informativeness",
        "diversity",
        "soft_shaping",
        "connectivity_gain",
        "compactness",
    )

    @staticmethod
    def _sigmoid_temperature(value: float, temperature: float = 2.0) -> float:
        """Return the logistic sigmoid of ``value / temperature``.

        The temperature is floored at 1e-6 to avoid division by zero. The two
        branches only ever call ``math.exp`` on a non-positive argument, so
        large magnitudes underflow to 0.0 instead of raising OverflowError.
        """
        scaled = float(value) / max(1e-6, float(temperature))
        if scaled >= 0:
            z = math.exp(-scaled)
            return 1.0 / (1.0 + z)
        z = math.exp(scaled)
        return z / (1.0 + z)

    @staticmethod
    def _clamp01(value: float) -> float:
        """Clamp ``value`` to the closed interval [0.0, 1.0]."""
        return max(0.0, min(1.0, value))

    def add(self, info: dict, task_type: str, graph_f1: float) -> None:
        """Fold one finished episode into the running totals.

        Args:
            info: Episode result mapping. Keys read here: ``agent_answer``,
                ``task_answer``, ``step_count``, ``tool_calls``,
                ``redundant_tool_calls``, ``total_reward``,
                ``reward_components`` (a mapping, may be missing or None),
                ``spawn_count``, ``spawn_finished_subtasks`` and
                ``spawn_critical_steps``. Missing keys count as zero.
            task_type: Episodes with ``"identity_resolution"`` also update
                the deanonymization counters.
            graph_f1: Graph F1 score of the episode; appended as-is.
        """
        self.episodes += 1

        # A missing agent answer must never count as a success, even when the
        # task answer is missing too (None == None would otherwise be True).
        agent_answer = info.get("agent_answer")
        ok = agent_answer is not None and agent_answer == info.get("task_answer")
        self.success += int(ok)

        self.total_steps += int(info.get("step_count", 0))
        self.total_tool_calls += int(info.get("tool_calls", 0))
        self.total_redundant_tool_calls += int(info.get("redundant_tool_calls", 0))
        self.total_reward += float(info.get("total_reward", 0.0))
        self.graph_f1_scores.append(graph_f1)

        # `or {}` also covers an explicit None stored under the key.
        components = info.get("reward_components") or {}
        for key in self._REWARD_COMPONENT_KEYS:
            attr = f"total_{key}"
            setattr(self, attr, getattr(self, attr) + float(components.get(key, 0.0)))

        self.total_spawn_count += int(info.get("spawn_count", 0))
        self.total_spawn_finished_subtasks += int(info.get("spawn_finished_subtasks", 0))
        self.total_spawn_critical_steps += int(info.get("spawn_critical_steps", 0))

        if task_type == "identity_resolution":
            self.deanonymization_total += 1
            self.deanonymization_success += int(ok)

    def summary(self) -> dict:
        """Return averaged metrics and the weighted leaderboard score.

        Every denominator is floored at 1, so calling this before any episode
        was added does not divide by zero. ``avg_reward`` is the mean raw
        reward passed through :meth:`_sigmoid_temperature` (temperature 2.0),
        so it lies in [0, 1].
        """
        episodes = max(1, self.episodes)

        task_success_rate = self.success / episodes
        tool_efficiency = 1.0 - (
            self.total_redundant_tool_calls / max(1, self.total_tool_calls)
        )
        avg_graph_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
        deanonymization_accuracy = self.deanonymization_success / max(
            1, self.deanonymization_total
        )

        avg_reward_raw = self.total_reward / episodes
        avg_reward = self._sigmoid_temperature(avg_reward_raw, temperature=2.0)

        avg_knowledge_carrier = self.total_knowledge_carrier / episodes
        avg_knowledge_indexing = self.total_knowledge_indexing / episodes
        avg_connectivity = self.total_connectivity / episodes
        avg_relation_informativeness = self.total_relation_informativeness / episodes
        avg_entity_informativeness = self.total_entity_informativeness / episodes
        avg_diversity = self.total_diversity / episodes
        avg_soft_shaping = self.total_soft_shaping / episodes
        avg_connectivity_gain = self.total_connectivity_gain / episodes
        avg_compactness = self.total_compactness / episodes

        avg_spawn_count = self.total_spawn_count / episodes
        spawn_completion = self.total_spawn_finished_subtasks / max(
            1, self.total_spawn_count
        )
        avg_spawn_critical_steps = self.total_spawn_critical_steps / episodes
        # Inverse of the average critical-step count, capped at 1.0.
        spawn_latency_signal = 1.0 / max(1.0, avg_spawn_critical_steps)
        spawn_signal = self._clamp01(
            0.6 * spawn_completion + 0.4 * spawn_latency_signal
        )

        reward_norm = avg_reward
        # Both composite signals start from a 0.5 baseline and are clamped.
        retrieval_signal = self._clamp01(
            0.5 + 0.35 * avg_knowledge_carrier + 0.35 * avg_knowledge_indexing
        )
        structural_signal = self._clamp01(
            0.5
            + 0.25 * avg_connectivity
            + 0.20 * avg_relation_informativeness
            + 0.20 * avg_entity_informativeness
            + 0.15 * avg_diversity
            + 0.10 * avg_connectivity_gain
        )

        leaderboard_score = (
            0.28 * task_success_rate
            + 0.20 * avg_graph_f1
            + 0.12 * tool_efficiency
            + 0.12 * deanonymization_accuracy
            + 0.14 * retrieval_signal
            + 0.09 * structural_signal
            + 0.05 * reward_norm
            + 0.04 * spawn_signal
        )

        return {
            "task_success_rate": task_success_rate,
            "tool_efficiency": tool_efficiency,
            "avg_graph_f1": avg_graph_f1,
            "avg_steps_to_solution": self.total_steps / episodes,
            "deanonymization_accuracy": deanonymization_accuracy,
            "avg_reward": avg_reward,
            "avg_knowledge_carrier_reward": avg_knowledge_carrier,
            "avg_knowledge_indexing_reward": avg_knowledge_indexing,
            "avg_connectivity_reward": avg_connectivity,
            "avg_format_reward": self.total_format_reward / episodes,
            "avg_relation_informativeness_reward": avg_relation_informativeness,
            "avg_entity_informativeness_reward": avg_entity_informativeness,
            "avg_diversity_reward": avg_diversity,
            "avg_soft_shaping_reward": avg_soft_shaping,
            "avg_connectivity_gain_reward": avg_connectivity_gain,
            "avg_compactness_reward": avg_compactness,
            "avg_spawn_count": avg_spawn_count,
            "spawn_completion_rate": spawn_completion,
            "avg_spawn_critical_steps": avg_spawn_critical_steps,
            "spawn_signal": spawn_signal,
            "retrieval_signal": retrieval_signal,
            "structural_signal": structural_signal,
            "leaderboard_score": leaderboard_score,
        }