Spaces:
Sleeping
Sleeping
Commit ·
ce675d4
1
Parent(s): d61a550
Add dashboard, Update reward, Multi-agent orchestration
Browse files- README.md +274 -42
- artifacts/leaderboard.json +84 -0
- artifacts/osint_dashboard.html +551 -0
- config/seed_example.json +51 -0
- config/shared_config.json +40 -0
- docs/reward_design_notes.md +94 -0
- src/osint_env/agents/__init__.py +5 -0
- src/osint_env/agents/swarm_agent.py +181 -0
- src/osint_env/cli.py +208 -5
- src/osint_env/config/__init__.py +9 -0
- src/osint_env/config/shared.py +226 -0
- src/osint_env/data/generator.py +364 -26
- src/osint_env/domain/models.py +57 -0
- src/osint_env/env/environment.py +66 -9
- src/osint_env/env/reward.py +406 -1
- src/osint_env/env/spawn_reward_hooks.py +93 -0
- src/osint_env/eval/leaderboard.py +83 -0
- src/osint_env/eval/metrics.py +94 -5
- src/osint_env/eval/runner.py +28 -4
- src/osint_env/viz/__init__.py +3 -0
- src/osint_env/viz/dashboard.py +707 -0
- tests/test_config.py +61 -0
- tests/test_dashboard.py +25 -0
- tests/test_eval.py +12 -1
- tests/test_leaderboard.py +47 -0
- tests/test_reward.py +53 -0
- tests/test_seeding.py +40 -0
- tests/test_spawn_reward_hooks.py +43 -0
- tests/test_swarm_agent.py +17 -0
README.md
CHANGED
|
@@ -1,42 +1,274 @@
|
|
| 1 |
-
# OSINT RL Environment
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OSINT RL Environment
|
| 2 |
+
|
| 3 |
+
This repository implements a simulated OSINT-style reinforcement learning environment where agents build and query a knowledge graph over fragmented multi-platform synthetic data.
|
| 4 |
+
|
| 5 |
+
The codebase now supports both single-agent and low-width multi-agent swarm execution, seeded task and graph bootstrapping, benchmark scoring, and interactive visualization.
|
| 6 |
+
|
| 7 |
+
## 1. What The Project Does
|
| 8 |
+
|
| 9 |
+
The environment models a realistic workflow for information discovery and linking:
|
| 10 |
+
|
| 11 |
+
1. Generate a hidden canonical graph with users, aliases, organizations, locations, and links.
|
| 12 |
+
2. Project noisy partial views into mock platforms (microblog, forum, profile).
|
| 13 |
+
3. Ask identity-resolution, network-discovery, and event-tracing questions.
|
| 14 |
+
4. Let agents call tools, add graph edges, and submit answers.
|
| 15 |
+
5. Score episodes using a composite reward that combines correctness, retrieval utility, graph quality, and efficiency.
|
| 16 |
+
|
| 17 |
+
## 2. Current Capabilities
|
| 18 |
+
|
| 19 |
+
- Single-agent baseline runner.
|
| 20 |
+
- Multi-agent swarm runner with constrained breadth and width (configurable, low by default).
|
| 21 |
+
- Seeded graph nodes and edges from user-provided JSON.
|
| 22 |
+
- Seeded questions from user-provided JSON.
|
| 23 |
+
- LLM-assisted generation hooks for remaining graph/task expansion with deterministic fallback.
|
| 24 |
+
- Persistent benchmark leaderboard with composite utility score.
|
| 25 |
+
- Interactive dashboard showing:
|
| 26 |
+
- canonical graph,
|
| 27 |
+
- episode graph diff (predicted vs truth),
|
| 28 |
+
- source database explorer,
|
| 29 |
+
- benchmark charts and leaderboard.
|
| 30 |
+
|
| 31 |
+
## 3. Installation
|
| 32 |
+
|
| 33 |
+
Environment setup from the project root:
|
| 34 |
+
|
| 35 |
+
1. Activate your Python environment.
|
| 36 |
+
2. Install package dependencies.
|
| 37 |
+
|
| 38 |
+
Example:
|
| 39 |
+
|
| 40 |
+
source ~/arl/bin/activate
|
| 41 |
+
uv pip install -e .
|
| 42 |
+
|
| 43 |
+
The project requires Python 3.10+.
|
| 44 |
+
|
| 45 |
+
## 4. Repository Layout
|
| 46 |
+
|
| 47 |
+
src/osint_env/
|
| 48 |
+
agents/ single-agent and swarm runners
|
| 49 |
+
config/ shared config loader
|
| 50 |
+
data/ canonical graph, views, and task generation
|
| 51 |
+
domain/ data models and configuration dataclasses
|
| 52 |
+
env/ OpenEnv environment and reward logic
|
| 53 |
+
eval/ metrics, runner, leaderboard
|
| 54 |
+
llm/ LLM client interface and local mock
|
| 55 |
+
memory/ in-memory KG and semantic memory
|
| 56 |
+
platforms/ platform tool APIs
|
| 57 |
+
viz/ dashboard export
|
| 58 |
+
cli.py command-line entrypoint
|
| 59 |
+
|
| 60 |
+
config/
|
| 61 |
+
shared_config.json shared runtime/environment/swarm/reward config
|
| 62 |
+
seed_example.json example seeded graph and question file
|
| 63 |
+
|
| 64 |
+
## 5. Shared Configuration
|
| 65 |
+
|
| 66 |
+
All core knobs are centralized in config/shared_config.json.
|
| 67 |
+
|
| 68 |
+
This file includes:
|
| 69 |
+
|
| 70 |
+
- environment generation controls,
|
| 71 |
+
- swarm limits,
|
| 72 |
+
- spawn reward shaping hyperparameters,
|
| 73 |
+
- seeding defaults,
|
| 74 |
+
- runtime output paths.
|
| 75 |
+
|
| 76 |
+
Default swarm settings are intentionally conservative:
|
| 77 |
+
|
| 78 |
+
- max_agents: 3
|
| 79 |
+
- max_breadth: 2
|
| 80 |
+
- max_width: 2
|
| 81 |
+
- max_depth: 2
|
| 82 |
+
|
| 83 |
+
These defaults keep orchestration cost and branching low while enabling swarm behavior.
|
| 84 |
+
|
| 85 |
+
## 6. Seeding Questions And Partial Graphs
|
| 86 |
+
|
| 87 |
+
You can manually seed:
|
| 88 |
+
|
| 89 |
+
- graph nodes,
|
| 90 |
+
- graph edges,
|
| 91 |
+
- task questions (optionally with answers and supporting edges).
|
| 92 |
+
|
| 93 |
+
Use a seed file with the same structure as config/seed_example.json and pass it using --seed-file.
|
| 94 |
+
|
| 95 |
+
Workflow:
|
| 96 |
+
|
| 97 |
+
1. Add your manual graph fragments and questions to a JSON file.
|
| 98 |
+
2. Keep llm_generate_remaining_graph and llm_generate_remaining_tasks enabled to fill the rest automatically.
|
| 99 |
+
3. Run demo/eval/benchmark with --seed-file.
|
| 100 |
+
|
| 101 |
+
## 7. CLI Usage
|
| 102 |
+
|
| 103 |
+
All commands accept:
|
| 104 |
+
|
| 105 |
+
- --config for shared config path (default: config/shared_config.json)
|
| 106 |
+
- --seed-file for seeded graph/task input JSON
|
| 107 |
+
- --agent-mode with values: config, single, swarm
|
| 108 |
+
|
| 109 |
+
Main commands:
|
| 110 |
+
|
| 111 |
+
1. Run one episode:
|
| 112 |
+
|
| 113 |
+
osint-env demo --agent-mode swarm
|
| 114 |
+
|
| 115 |
+
2. Evaluate episodes:
|
| 116 |
+
|
| 117 |
+
osint-env eval --episodes 20 --agent-mode single
|
| 118 |
+
|
| 119 |
+
3. Benchmark and export dashboard:
|
| 120 |
+
|
| 121 |
+
osint-env benchmark --episodes 20 --name baseline_swarm
|
| 122 |
+
|
| 123 |
+
4. Multi-seed benchmark sweep:
|
| 124 |
+
|
| 125 |
+
osint-env benchmark-sweep --seeds 7,11,17,23,31 --name-prefix sweep_swarm
|
| 126 |
+
|
| 127 |
+
5. Print leaderboard:
|
| 128 |
+
|
| 129 |
+
osint-env leaderboard --sort-by leaderboard_score --top 15
|
| 130 |
+
|
| 131 |
+
6. Export explorer without full benchmark:
|
| 132 |
+
|
| 133 |
+
osint-env viz --with-demo --output artifacts/osint_explorer.html
|
| 134 |
+
|
| 135 |
+
## 8. Multi-Agent Swarm Design
|
| 136 |
+
|
| 137 |
+
Swarm orchestration is implemented in src/osint_env/agents/swarm_agent.py.
|
| 138 |
+
|
| 139 |
+
Design choices:
|
| 140 |
+
|
| 141 |
+
- Shared environment state (single episode state machine).
|
| 142 |
+
- Planner rounds bounded by max_depth and planner_rounds.
|
| 143 |
+
- Parallel workers bounded by min(max_agents, max_breadth, max_width).
|
| 144 |
+
- Each worker performs limited tool calls, then attempts edge addition.
|
| 145 |
+
- Final answer is submitted once planning rounds complete or episode ends.
|
| 146 |
+
|
| 147 |
+
Reward compatibility:
|
| 148 |
+
|
| 149 |
+
- Existing edge and answer reward components are unchanged.
|
| 150 |
+
- Spawn utility is added as an auxiliary term using the PARL-style helper in src/osint_env/env/spawn_reward_hooks.py.
|
| 151 |
+
- Spawn telemetry (count, critical steps, completion) is tracked in episode info and evaluation summaries.
|
| 152 |
+
|
| 153 |
+
## 9. Reward Design (Integrated Notes)
|
| 154 |
+
|
| 155 |
+
The reward function is a composite of graph-construction and answer-time utility terms. It combines ideas from DeepPath, EMNLP 2018 reward shaping, UniRel, and AutoGraph-R1.
|
| 156 |
+
|
| 157 |
+
### 9.1 Edge Reward During Graph Construction
|
| 158 |
+
|
| 159 |
+
For each ADD_EDGE action, the environment combines:
|
| 160 |
+
|
| 161 |
+
1. Global accuracy signal (DeepPath-style positive/negative credit).
|
| 162 |
+
2. Soft shaping term inspired by EMNLP 2018 reward shaping:
|
| 163 |
+
|
| 164 |
+
R = R_b + (1 - R_b) * f(s, r, o)
|
| 165 |
+
|
| 166 |
+
where f is approximated in code with relation and type priors plus small domain priors.
|
| 167 |
+
|
| 168 |
+
3. Efficiency bonus inversely proportional to step count.
|
| 169 |
+
4. Diversity bonus using signature novelty against previous edges.
|
| 170 |
+
5. Relation informativeness using normalized relation IDF.
|
| 171 |
+
6. Entity informativeness using inverse hubness penalty.
|
| 172 |
+
7. Connectivity gain bonus for bridge-style edges.
|
| 173 |
+
|
| 174 |
+
### 9.2 Final Answer Reward
|
| 175 |
+
|
| 176 |
+
For ANSWER, reward includes:
|
| 177 |
+
|
| 178 |
+
1. format validity,
|
| 179 |
+
2. correctness,
|
| 180 |
+
3. knowledge-carrying utility (AutoGraph-style deducibility),
|
| 181 |
+
4. knowledge-indexing utility (AutoGraph-style evidence coverage proxy over tool outputs),
|
| 182 |
+
5. UniRel-style connectivity score over seed entities,
|
| 183 |
+
6. graph F1 against supporting edges,
|
| 184 |
+
7. compactness and repetition controls,
|
| 185 |
+
8. efficiency and informativeness terms.
|
| 186 |
+
|
| 187 |
+
### 9.3 Swarm Auxiliary Reward
|
| 188 |
+
|
| 189 |
+
The swarm runner adds a PARL-style auxiliary term based on:
|
| 190 |
+
|
| 191 |
+
- spawn parallelism,
|
| 192 |
+
- finished subtask ratio,
|
| 193 |
+
- critical-step latency proxy,
|
| 194 |
+
- optional breadth and depth shaping.
|
| 195 |
+
|
| 196 |
+
This auxiliary term is configurable in shared_config.json via spawn_reward.
|
| 197 |
+
|
| 198 |
+
### 9.4 Benchmark Metrics
|
| 199 |
+
|
| 200 |
+
Evaluation tracks:
|
| 201 |
+
|
| 202 |
+
- task success,
|
| 203 |
+
- graph F1,
|
| 204 |
+
- deanonymization accuracy,
|
| 205 |
+
- tool efficiency,
|
| 206 |
+
- retrieval and structural utility signals,
|
| 207 |
+
- spawn signals (for swarm runs),
|
| 208 |
+
- composite leaderboard score.
|
| 209 |
+
|
| 210 |
+
## 10. Interactive Dashboard
|
| 211 |
+
|
| 212 |
+
Dashboard export includes:
|
| 213 |
+
|
| 214 |
+
- canonical graph explorer,
|
| 215 |
+
- episode graph comparison,
|
| 216 |
+
- node and edge inspectors,
|
| 217 |
+
- source database table with record detail pane,
|
| 218 |
+
- reward and graph traces,
|
| 219 |
+
- sortable leaderboard snapshot.
|
| 220 |
+
|
| 221 |
+
Primary outputs:
|
| 222 |
+
|
| 223 |
+
- artifacts/osint_dashboard.html
|
| 224 |
+
- artifacts/osint_explorer.html
|
| 225 |
+
- artifacts/sweep_dashboards/*.html
|
| 226 |
+
|
| 227 |
+
## 11. Notes On LLM Generation
|
| 228 |
+
|
| 229 |
+
Dataset generation supports an LLM-assisted expansion path for remaining tasks and graph edges.
|
| 230 |
+
|
| 231 |
+
If no model is connected or structured output is unavailable, deterministic template fallback is used. This preserves reproducibility while keeping the interface compatible with stronger local or remote LLMs.
|
| 232 |
+
|
| 233 |
+
## 12. Citation And Source Papers
|
| 234 |
+
|
| 235 |
+
Reward components and swarm hooks are informed by the following papers:
|
| 236 |
+
|
| 237 |
+
1. AutoGraph-R1: Enhancing Agentic RAG with Graph-R1 for Complex QA.
|
| 238 |
+
arXiv: https://arxiv.org/abs/2510.15339
|
| 239 |
+
|
| 240 |
+
2. UniRel: Graph-based Relational Retrieval for LLM Reasoning.
|
| 241 |
+
arXiv: https://arxiv.org/abs/2512.17043
|
| 242 |
+
|
| 243 |
+
3. DeepPath: A Reinforcement Learning Method for Knowledge Graph Reasoning.
|
| 244 |
+
EMNLP 2017: https://aclanthology.org/D17-1060/
|
| 245 |
+
|
| 246 |
+
4. Multi-Hop Knowledge Graph Reasoning with Reward Shaping.
|
| 247 |
+
EMNLP 2018: https://aclanthology.org/D18-1362/
|
| 248 |
+
|
| 249 |
+
5. Kimi K2.5 (PARL-style multi-agent shaping motivation).
|
| 250 |
+
arXiv: https://arxiv.org/abs/2602.02276
|
| 251 |
+
|
| 252 |
+
Additional context:
|
| 253 |
+
|
| 254 |
+
6. MINERVA: Go for a Walk and Arrive at the Answer — Reasoning over Paths in Knowledge Bases using Reinforcement Learning.
|
| 255 |
+
arXiv: https://arxiv.org/abs/1711.05851
|
| 256 |
+
|
| 257 |
+
## 13. Development And Testing
|
| 258 |
+
|
| 259 |
+
Run tests from project root:
|
| 260 |
+
|
| 261 |
+
pytest -q
|
| 262 |
+
|
| 263 |
+
Recommended validation after config changes:
|
| 264 |
+
|
| 265 |
+
1. osint-env demo --agent-mode swarm
|
| 266 |
+
2. osint-env eval --episodes 5
|
| 267 |
+
3. osint-env benchmark --episodes 5 --name quick_check
|
| 268 |
+
4. osint-env leaderboard --top 5
|
| 269 |
+
|
| 270 |
+
## 14. Scope Boundaries
|
| 271 |
+
|
| 272 |
+
- This repository supports a low-width swarm baseline and reward-compatible orchestration.
|
| 273 |
+
- It does not include a full distributed training stack or asynchronous external worker runtime.
|
| 274 |
+
- The architecture keeps those extensions possible without breaking current interfaces.
|
artifacts/leaderboard.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"config": {
|
| 4 |
+
"max_agents": 3,
|
| 5 |
+
"max_breadth": 2,
|
| 6 |
+
"max_depth": 2,
|
| 7 |
+
"max_steps": 18,
|
| 8 |
+
"max_width": 2,
|
| 9 |
+
"seed": 7,
|
| 10 |
+
"seeded_questions": 1,
|
| 11 |
+
"swarm_enabled": true
|
| 12 |
+
},
|
| 13 |
+
"created_at": "2026-04-01T12:03:13+00:00",
|
| 14 |
+
"episodes": 2,
|
| 15 |
+
"metrics": {
|
| 16 |
+
"avg_compactness_reward": 0.0,
|
| 17 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 18 |
+
"avg_connectivity_reward": 0.3,
|
| 19 |
+
"avg_diversity_reward": 0.08,
|
| 20 |
+
"avg_entity_informativeness_reward": 0.024705877237863647,
|
| 21 |
+
"avg_format_reward": 0.15,
|
| 22 |
+
"avg_graph_f1": 1.0,
|
| 23 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 24 |
+
"avg_knowledge_indexing_reward": 0.15,
|
| 25 |
+
"avg_relation_informativeness_reward": 0.03137141693971891,
|
| 26 |
+
"avg_reward": 3.534162700533434,
|
| 27 |
+
"avg_soft_shaping_reward": 0.15,
|
| 28 |
+
"avg_spawn_count": 4.0,
|
| 29 |
+
"avg_spawn_critical_steps": 6.0,
|
| 30 |
+
"avg_steps_to_solution": 9.0,
|
| 31 |
+
"deanonymization_accuracy": 1.0,
|
| 32 |
+
"leaderboard_score": 0.8618382743087459,
|
| 33 |
+
"retrieval_signal": 0.7275,
|
| 34 |
+
"spawn_completion_rate": 1.0,
|
| 35 |
+
"spawn_signal": 0.6666666666666666,
|
| 36 |
+
"structural_signal": 0.6082154588355165,
|
| 37 |
+
"task_success_rate": 1.0,
|
| 38 |
+
"tool_efficiency": 0.25
|
| 39 |
+
},
|
| 40 |
+
"run_id": "run_0001",
|
| 41 |
+
"run_name": "swarm_seed_smoke"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"config": {
|
| 45 |
+
"max_agents": 3,
|
| 46 |
+
"max_breadth": 2,
|
| 47 |
+
"max_depth": 2,
|
| 48 |
+
"max_steps": 18,
|
| 49 |
+
"max_width": 2,
|
| 50 |
+
"seed": 7,
|
| 51 |
+
"seeded_questions": 1,
|
| 52 |
+
"swarm_enabled": true
|
| 53 |
+
},
|
| 54 |
+
"created_at": "2026-04-01T12:16:28+00:00",
|
| 55 |
+
"episodes": 2,
|
| 56 |
+
"metrics": {
|
| 57 |
+
"avg_compactness_reward": 0.0,
|
| 58 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 59 |
+
"avg_connectivity_reward": 0.3,
|
| 60 |
+
"avg_diversity_reward": 0.08,
|
| 61 |
+
"avg_entity_informativeness_reward": 0.024705877237863647,
|
| 62 |
+
"avg_format_reward": 0.15,
|
| 63 |
+
"avg_graph_f1": 1.0,
|
| 64 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 65 |
+
"avg_knowledge_indexing_reward": 0.15,
|
| 66 |
+
"avg_relation_informativeness_reward": 0.03137141693971891,
|
| 67 |
+
"avg_reward": 3.534162700533434,
|
| 68 |
+
"avg_soft_shaping_reward": 0.15,
|
| 69 |
+
"avg_spawn_count": 4.0,
|
| 70 |
+
"avg_spawn_critical_steps": 6.0,
|
| 71 |
+
"avg_steps_to_solution": 9.0,
|
| 72 |
+
"deanonymization_accuracy": 1.0,
|
| 73 |
+
"leaderboard_score": 0.8618382743087459,
|
| 74 |
+
"retrieval_signal": 0.7275,
|
| 75 |
+
"spawn_completion_rate": 1.0,
|
| 76 |
+
"spawn_signal": 0.6666666666666666,
|
| 77 |
+
"structural_signal": 0.6082154588355165,
|
| 78 |
+
"task_success_rate": 1.0,
|
| 79 |
+
"tool_efficiency": 0.25
|
| 80 |
+
},
|
| 81 |
+
"run_id": "run_0002",
|
| 82 |
+
"run_name": "swarm_seed_smoke"
|
| 83 |
+
}
|
| 84 |
+
]
|
artifacts/osint_dashboard.html
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 6 |
+
<title>OSINT Environment Dashboard</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 9 |
+
<link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;600&display=swap" rel="stylesheet" />
|
| 10 |
+
<link href="https://unpkg.com/vis-network@9.1.9/styles/vis-network.min.css" rel="stylesheet" />
|
| 11 |
+
<script src="https://unpkg.com/vis-network@9.1.9/standalone/umd/vis-network.min.js"></script>
|
| 12 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.3/dist/chart.umd.min.js"></script>
|
| 13 |
+
<style>
|
| 14 |
+
:root {
|
| 15 |
+
--ink: #1d232f;
|
| 16 |
+
--muted: #5f6d7a;
|
| 17 |
+
--line: #d5dfe8;
|
| 18 |
+
--bg: #f5f8fb;
|
| 19 |
+
--card: #ffffff;
|
| 20 |
+
--brand: #0f766e;
|
| 21 |
+
--brand-soft: #d4f4ef;
|
| 22 |
+
--accent: #d97706;
|
| 23 |
+
--accent-soft: #ffe7c2;
|
| 24 |
+
--ok: #15803d;
|
| 25 |
+
--danger: #b91c1c;
|
| 26 |
+
}
|
| 27 |
+
* { box-sizing: border-box; }
|
| 28 |
+
body {
|
| 29 |
+
margin: 0;
|
| 30 |
+
color: var(--ink);
|
| 31 |
+
font-family: "Space Grotesk", "Segoe UI", sans-serif;
|
| 32 |
+
background:
|
| 33 |
+
radial-gradient(1200px 500px at -5% -20%, #d8efe9, transparent 70%),
|
| 34 |
+
radial-gradient(900px 500px at 110% -10%, #ffe9cf, transparent 65%),
|
| 35 |
+
var(--bg);
|
| 36 |
+
}
|
| 37 |
+
.wrap { max-width: 1500px; margin: 0 auto; padding: 20px; }
|
| 38 |
+
.card {
|
| 39 |
+
background: var(--card);
|
| 40 |
+
border: 1px solid var(--line);
|
| 41 |
+
border-radius: 18px;
|
| 42 |
+
padding: 16px;
|
| 43 |
+
box-shadow: 0 10px 24px rgba(24, 39, 59, 0.06);
|
| 44 |
+
}
|
| 45 |
+
.hero {
|
| 46 |
+
display: grid;
|
| 47 |
+
grid-template-columns: 2.1fr 1fr;
|
| 48 |
+
gap: 14px;
|
| 49 |
+
margin-bottom: 14px;
|
| 50 |
+
}
|
| 51 |
+
.hero-main {
|
| 52 |
+
background: linear-gradient(145deg, #f7fffd, #fff8ef);
|
| 53 |
+
border: 1px solid #e6efe8;
|
| 54 |
+
}
|
| 55 |
+
h1 { margin: 0 0 8px; font-size: 30px; letter-spacing: -0.02em; }
|
| 56 |
+
h2 { margin: 0 0 10px; font-size: 18px; letter-spacing: -0.01em; }
|
| 57 |
+
.muted { color: var(--muted); }
|
| 58 |
+
.pill-row { display: flex; gap: 8px; flex-wrap: wrap; margin-top: 8px; }
|
| 59 |
+
.pill {
|
| 60 |
+
border: 1px solid #dce8e6;
|
| 61 |
+
background: #fbfffe;
|
| 62 |
+
border-radius: 999px;
|
| 63 |
+
padding: 4px 10px;
|
| 64 |
+
font-size: 12px;
|
| 65 |
+
color: #214742;
|
| 66 |
+
}
|
| 67 |
+
.stats { display: grid; grid-template-columns: repeat(3, minmax(120px, 1fr)); gap: 10px; margin-top: 10px; }
|
| 68 |
+
.stat {
|
| 69 |
+
border: 1px dashed #cde2df;
|
| 70 |
+
background: linear-gradient(180deg, #fcfffe, #f6fffc);
|
| 71 |
+
border-radius: 12px;
|
| 72 |
+
padding: 10px;
|
| 73 |
+
}
|
| 74 |
+
.stat .k { font-size: 11px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.06em; }
|
| 75 |
+
.stat .v { font-size: 22px; font-weight: 700; }
|
| 76 |
+
.layout { display: grid; grid-template-columns: 1.2fr 3fr 1.2fr; gap: 14px; margin-bottom: 14px; }
|
| 77 |
+
.control-col { display: flex; flex-direction: column; gap: 14px; }
|
| 78 |
+
.control-grid { display: grid; gap: 8px; }
|
| 79 |
+
.graph-wrap { position: relative; overflow: hidden; }
|
| 80 |
+
.graph { height: 540px; border: 1px solid var(--line); border-radius: 14px; background: #fbfdff; }
|
| 81 |
+
.graph-banner {
|
| 82 |
+
position: absolute;
|
| 83 |
+
top: 10px;
|
| 84 |
+
left: 10px;
|
| 85 |
+
background: rgba(255,255,255,0.93);
|
| 86 |
+
border: 1px solid var(--line);
|
| 87 |
+
border-radius: 12px;
|
| 88 |
+
padding: 6px 10px;
|
| 89 |
+
font-size: 12px;
|
| 90 |
+
z-index: 2;
|
| 91 |
+
backdrop-filter: blur(4px);
|
| 92 |
+
}
|
| 93 |
+
.legend { display: flex; gap: 8px; flex-wrap: wrap; margin-top: 8px; font-size: 12px; }
|
| 94 |
+
.dot { width: 9px; height: 9px; border-radius: 999px; display: inline-block; margin-right: 4px; }
|
| 95 |
+
.mono { font-family: "IBM Plex Mono", monospace; font-size: 12px; }
|
| 96 |
+
.inline { display: flex; gap: 8px; align-items: center; }
|
| 97 |
+
.split { display: grid; grid-template-columns: 2fr 1.3fr; gap: 14px; margin-bottom: 14px; }
|
| 98 |
+
.db-tabs { display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 8px; }
|
| 99 |
+
.tab {
|
| 100 |
+
border: 1px solid var(--line);
|
| 101 |
+
border-radius: 9px;
|
| 102 |
+
padding: 5px 10px;
|
| 103 |
+
background: #fff;
|
| 104 |
+
cursor: pointer;
|
| 105 |
+
font-size: 12px;
|
| 106 |
+
}
|
| 107 |
+
.tab.active { background: var(--brand-soft); border-color: #b5e7de; color: #08554e; }
|
| 108 |
+
.table-wrap { max-height: 320px; overflow: auto; border: 1px solid var(--line); border-radius: 12px; }
|
| 109 |
+
table { width: 100%; border-collapse: collapse; font-size: 12.5px; }
|
| 110 |
+
th, td { padding: 8px; border-bottom: 1px solid #edf2f7; text-align: left; vertical-align: top; }
|
| 111 |
+
th { position: sticky; top: 0; background: #f7fbff; z-index: 1; }
|
| 112 |
+
tr:hover td { background: #f9fcff; }
|
| 113 |
+
.json-view {
|
| 114 |
+
height: 320px;
|
| 115 |
+
overflow: auto;
|
| 116 |
+
border: 1px solid var(--line);
|
| 117 |
+
border-radius: 12px;
|
| 118 |
+
background: #0f172a;
|
| 119 |
+
color: #d2f8ee;
|
| 120 |
+
padding: 10px;
|
| 121 |
+
margin: 0;
|
| 122 |
+
}
|
| 123 |
+
.charts { display: grid; grid-template-columns: 1fr 1fr; gap: 14px; margin-bottom: 14px; }
|
| 124 |
+
.chart-box { height: 300px; }
|
| 125 |
+
select, input[type="search"], button {
|
| 126 |
+
border: 1px solid var(--line);
|
| 127 |
+
border-radius: 9px;
|
| 128 |
+
padding: 8px;
|
| 129 |
+
font: inherit;
|
| 130 |
+
background: #fff;
|
| 131 |
+
color: var(--ink);
|
| 132 |
+
}
|
| 133 |
+
button { cursor: pointer; background: #fff; }
|
| 134 |
+
button.primary { background: var(--brand); border-color: #0e6f68; color: #fff; }
|
| 135 |
+
.subtle { background: #f7fafc; }
|
| 136 |
+
@media (max-width: 1100px) {
|
| 137 |
+
.hero, .layout, .split, .charts { grid-template-columns: 1fr; }
|
| 138 |
+
.graph { height: 440px; }
|
| 139 |
+
}
|
| 140 |
+
</style>
|
| 141 |
+
</head>
|
| 142 |
+
<body>
|
| 143 |
+
<div class="wrap">
|
| 144 |
+
<div class="hero">
|
| 145 |
+
<section class="card hero-main">
|
| 146 |
+
<h1>OSINT Benchmark Dashboard</h1>
|
| 147 |
+
<p class="muted">Interactive explorer for canonical knowledge graph, episode traces, source platform records, and benchmark ranking.</p>
|
| 148 |
+
<div class="pill-row" id="hero-pills"></div>
|
| 149 |
+
<div class="stats" id="stats"></div>
|
| 150 |
+
</section>
|
| 151 |
+
<section class="card">
|
| 152 |
+
<h2>Latest Task Snapshot</h2>
|
| 153 |
+
<div><strong>Task ID:</strong> <span id="task-id"></span></div>
|
| 154 |
+
<div><strong>Task Type:</strong> <span id="task-type"></span></div>
|
| 155 |
+
<div style="margin-top:8px"><strong>Question</strong></div>
|
| 156 |
+
<div id="task-question" class="muted"></div>
|
| 157 |
+
<div style="margin-top:8px"><strong>Answer</strong>: <span id="task-answer"></span></div>
|
| 158 |
+
</section>
|
| 159 |
+
</div>
|
| 160 |
+
|
| 161 |
+
<div class="layout">
|
| 162 |
+
<section class="card control-col">
|
| 163 |
+
<div>
|
| 164 |
+
<h2>Graph Controls</h2>
|
| 165 |
+
<div class="control-grid">
|
| 166 |
+
<label class="mono" for="graph-mode">Graph Layer</label>
|
| 167 |
+
<select id="graph-mode">
|
| 168 |
+
<option value="canonical">Canonical Graph</option>
|
| 169 |
+
<option value="episode">Episode Graph</option>
|
| 170 |
+
</select>
|
| 171 |
+
<label class="mono" for="graph-search">Node Search</label>
|
| 172 |
+
<input id="graph-search" type="search" placeholder="Type node id or label..." />
|
| 173 |
+
<label class="mono" for="relation-filter">Relation Filter</label>
|
| 174 |
+
<input id="relation-filter" type="search" placeholder="Filter edge labels..." />
|
| 175 |
+
<button id="fit-graph" class="primary">Fit Graph</button>
|
| 176 |
+
</div>
|
| 177 |
+
</div>
|
| 178 |
+
<div>
|
| 179 |
+
<h2>Node Types</h2>
|
| 180 |
+
<div id="type-filters" class="control-grid mono"></div>
|
| 181 |
+
</div>
|
| 182 |
+
</section>
|
| 183 |
+
|
| 184 |
+
<section class="card">
|
| 185 |
+
<h2>Graph Explorer</h2>
|
| 186 |
+
<div class="graph-wrap">
|
| 187 |
+
<div class="graph-banner" id="graph-banner">Layer: Canonical Graph</div>
|
| 188 |
+
<div id="graph-canvas" class="graph"></div>
|
| 189 |
+
</div>
|
| 190 |
+
<div class="legend">
|
| 191 |
+
<span><span class="dot" style="background:#16a34a"></span>matched edge</span>
|
| 192 |
+
<span><span class="dot" style="background:#2563eb"></span>predicted only</span>
|
| 193 |
+
<span><span class="dot" style="background:#f59e0b"></span>truth only</span>
|
| 194 |
+
</div>
|
| 195 |
+
</section>
|
| 196 |
+
|
| 197 |
+
<section class="card control-col">
|
| 198 |
+
<div>
|
| 199 |
+
<h2>Node Inspector</h2>
|
| 200 |
+
<pre id="node-detail" class="json-view">Click a node to inspect attributes and neighbors.</pre>
|
| 201 |
+
</div>
|
| 202 |
+
<div>
|
| 203 |
+
<h2>Edge Inspector</h2>
|
| 204 |
+
<pre id="edge-detail" class="json-view">Click an edge to inspect relation details.</pre>
|
| 205 |
+
</div>
|
| 206 |
+
</section>
|
| 207 |
+
</div>
|
| 208 |
+
|
| 209 |
+
<div class="split">
|
| 210 |
+
<section class="card">
|
| 211 |
+
<h2>Original Database Explorer</h2>
|
| 212 |
+
<div class="db-tabs" id="db-tabs"></div>
|
| 213 |
+
<div class="inline" style="margin-bottom:8px">
|
| 214 |
+
<input id="db-search" type="search" placeholder="Search records..." style="flex:1" />
|
| 215 |
+
<select id="db-limit">
|
| 216 |
+
<option value="200">200</option>
|
| 217 |
+
<option value="500">500</option>
|
| 218 |
+
<option value="1000">1000</option>
|
| 219 |
+
</select>
|
| 220 |
+
</div>
|
| 221 |
+
<div class="table-wrap"><table id="db-table"></table></div>
|
| 222 |
+
</section>
|
| 223 |
+
|
| 224 |
+
<section class="card">
|
| 225 |
+
<h2>Selected Source Record</h2>
|
| 226 |
+
<pre id="db-detail" class="json-view">Click a row in the database table to inspect full JSON.</pre>
|
| 227 |
+
</section>
|
| 228 |
+
</div>
|
| 229 |
+
|
| 230 |
+
<div class="charts">
|
| 231 |
+
<section class="card">
|
| 232 |
+
<h2>Benchmark Summary Radar</h2>
|
| 233 |
+
<div class="chart-box"><canvas id="summary-chart"></canvas></div>
|
| 234 |
+
</section>
|
| 235 |
+
<section class="card">
|
| 236 |
+
<h2>Episode Reward and Graph F1</h2>
|
| 237 |
+
<div class="chart-box"><canvas id="trace-chart"></canvas></div>
|
| 238 |
+
</section>
|
| 239 |
+
</div>
|
| 240 |
+
|
| 241 |
+
<section class="card">
|
| 242 |
+
<h2>Benchmark Leaderboard</h2>
|
| 243 |
+
<div class="inline" style="margin-bottom:8px">
|
| 244 |
+
<label class="mono" for="leader-sort">Sort by</label>
|
| 245 |
+
<select id="leader-sort" class="subtle">
|
| 246 |
+
<option value="leaderboard_score">leaderboard_score</option>
|
| 247 |
+
<option value="task_success_rate">task_success_rate</option>
|
| 248 |
+
<option value="avg_graph_f1">avg_graph_f1</option>
|
| 249 |
+
<option value="retrieval_signal">retrieval_signal</option>
|
| 250 |
+
<option value="structural_signal">structural_signal</option>
|
| 251 |
+
<option value="spawn_signal">spawn_signal</option>
|
| 252 |
+
<option value="avg_reward">avg_reward</option>
|
| 253 |
+
</select>
|
| 254 |
+
</div>
|
| 255 |
+
<div class="table-wrap"><table id="leaderboard-table"></table></div>
|
| 256 |
+
</section>
|
| 257 |
+
</div>
|
| 258 |
+
|
| 259 |
+
<script>
|
| 260 |
+
const payload = {"summary": {"task_success_rate": 1.0, "tool_efficiency": 0.25, "avg_graph_f1": 1.0, "avg_steps_to_solution": 9.0, "deanonymization_accuracy": 1.0, "avg_reward": 3.534162700533434, "avg_knowledge_carrier_reward": 0.5, "avg_knowledge_indexing_reward": 0.15, "avg_connectivity_reward": 0.3, "avg_format_reward": 0.15, "avg_relation_informativeness_reward": 0.03137141693971891, "avg_entity_informativeness_reward": 0.024705877237863647, "avg_diversity_reward": 0.08, "avg_soft_shaping_reward": 0.15, "avg_connectivity_gain_reward": 0.1, "avg_compactness_reward": 0.0, "avg_spawn_count": 4.0, "spawn_completion_rate": 1.0, "avg_spawn_critical_steps": 6.0, "spawn_signal": 0.6666666666666666, "retrieval_signal": 0.7275, "structural_signal": 0.6082154588355165, "leaderboard_score": 0.8618382743087459}, "episodes": [{"task_id": "seed_task_0", "task_type": "identity_resolution", "graph_f1": 1.0, "reward": 3.279727292219666, "steps": 9, "tool_calls": 4, "success": 1, "reward_components": {"tool_novelty": -0.55, "tool_relevance": 0.0, "total": 3.951641885863814, "global_accuracy": 0.85, "soft_shaping": 0.15, "efficiency": 0.06333333333333334, "diversity": 0.08, "relation_informativeness": 0.03137141693971891, "entity_informativeness": 0.026937135590762374, "connectivity_gain": 0.1, "duplicate_edge_penalty": -0.44999999999999996, "format_reward": 0.15, "correctness": 1.15, "knowledge_carrier": 0.5, "knowledge_indexing": 0.0, "connectivity": 0.3, "graph_f1": 0.55, "compactness": 0.0, "repetition_penalty": 0.0, "spawn_auxiliary": 0.32808540635585226, "spawn_count": 4.0, "spawn_finished_subtasks": 4.0, "spawn_critical_steps": 6.0, "spawn_depth": 2.0, "spawn_breadth": 2.0}, "spawn_count": 4, "spawn_critical_steps": 6}, {"task_id": "task_1", "task_type": "identity_resolution", "graph_f1": 1.0, "reward": 3.788598108847202, "steps": 9, "tool_calls": 4, "success": 1, "reward_components": {"tool_novelty": -0.55, "tool_relevance": 0.21333333333333332, "total": 
4.247179369158016, "global_accuracy": 0.85, "soft_shaping": 0.15, "efficiency": 0.06333333333333334, "diversity": 0.08, "relation_informativeness": 0.03137141693971891, "entity_informativeness": 0.02247461888496492, "connectivity_gain": 0.1, "duplicate_edge_penalty": -0.44999999999999996, "format_reward": 0.15, "correctness": 1.15, "knowledge_carrier": 0.5, "knowledge_indexing": 0.3, "connectivity": 0.3, "graph_f1": 0.55, "compactness": 0.0, "repetition_penalty": 0.0, "spawn_auxiliary": 0.32808540635585226, "spawn_count": 4.0, "spawn_finished_subtasks": 4.0, "spawn_critical_steps": 6.0, "spawn_depth": 2.0, "spawn_breadth": 2.0}, "spawn_count": 4, "spawn_critical_steps": 6}], "leaderboard": [{"config": {"max_agents": 3, "max_breadth": 2, "max_depth": 2, "max_steps": 18, "max_width": 2, "seed": 7, "seeded_questions": 1, "swarm_enabled": true}, "created_at": "2026-04-01T12:03:13+00:00", "episodes": 2, "metrics": {"avg_compactness_reward": 0.0, "avg_connectivity_gain_reward": 0.1, "avg_connectivity_reward": 0.3, "avg_diversity_reward": 0.08, "avg_entity_informativeness_reward": 0.024705877237863647, "avg_format_reward": 0.15, "avg_graph_f1": 1.0, "avg_knowledge_carrier_reward": 0.5, "avg_knowledge_indexing_reward": 0.15, "avg_relation_informativeness_reward": 0.03137141693971891, "avg_reward": 3.534162700533434, "avg_soft_shaping_reward": 0.15, "avg_spawn_count": 4.0, "avg_spawn_critical_steps": 6.0, "avg_steps_to_solution": 9.0, "deanonymization_accuracy": 1.0, "leaderboard_score": 0.8618382743087459, "retrieval_signal": 0.7275, "spawn_completion_rate": 1.0, "spawn_signal": 0.6666666666666666, "structural_signal": 0.6082154588355165, "task_success_rate": 1.0, "tool_efficiency": 0.25}, "run_id": "run_0001", "run_name": "swarm_seed_smoke"}, {"config": {"max_agents": 3, "max_breadth": 2, "max_depth": 2, "max_steps": 18, "max_width": 2, "seed": 7, "seeded_questions": 1, "swarm_enabled": true}, "created_at": "2026-04-01T12:16:28+00:00", "episodes": 2, "metrics": 
{"avg_compactness_reward": 0.0, "avg_connectivity_gain_reward": 0.1, "avg_connectivity_reward": 0.3, "avg_diversity_reward": 0.08, "avg_entity_informativeness_reward": 0.024705877237863647, "avg_format_reward": 0.15, "avg_graph_f1": 1.0, "avg_knowledge_carrier_reward": 0.5, "avg_knowledge_indexing_reward": 0.15, "avg_relation_informativeness_reward": 0.03137141693971891, "avg_reward": 3.534162700533434, "avg_soft_shaping_reward": 0.15, "avg_spawn_count": 4.0, "avg_spawn_critical_steps": 6.0, "avg_steps_to_solution": 9.0, "deanonymization_accuracy": 1.0, "leaderboard_score": 0.8618382743087459, "retrieval_signal": 0.7275, "spawn_completion_rate": 1.0, "spawn_signal": 0.6666666666666666, "structural_signal": 0.6082154588355165, "task_success_rate": 1.0, "tool_efficiency": 0.25}, "run_id": "run_0002", "run_name": "swarm_seed_smoke"}], "canonical_graph": {"nodes": [{"id": "user_0", "label": "Person 0", "group": "user", "title": "name: Person 0\\norg: Helios Labs\\nlocation: Pune", "attrs": {"name": "Person 0", "org": "Helios Labs", "location": "Pune"}}, {"id": "org_helios_labs", "label": "Helios Labs", "group": "org", "title": "name: Helios Labs", "attrs": {"name": "Helios Labs"}}, {"id": "loc_pune", "label": "Pune", "group": "location", "title": "name: Pune", "attrs": {"name": "Pune"}}, {"id": "user_1", "label": "Person 1", "group": "user", "title": "name: Person 1\\norg: Apex Dynamics\\nlocation: Bengaluru", "attrs": {"name": "Person 1", "org": "Apex Dynamics", "location": "Bengaluru"}}, {"id": "org_apex_dynamics", "label": "Apex Dynamics", "group": "org", "title": "name: Apex Dynamics", "attrs": {"name": "Apex Dynamics"}}, {"id": "loc_bengaluru", "label": "Bengaluru", "group": "location", "title": "name: Bengaluru", "attrs": {"name": "Bengaluru"}}, {"id": "user_2", "label": "Person 2", "group": "user", "title": "name: Person 2\\norg: Apex Dynamics\\nlocation: Hyderabad", "attrs": {"name": "Person 2", "org": "Apex Dynamics", "location": "Hyderabad"}}, {"id": 
"loc_hyderabad", "label": "Hyderabad", "group": "location", "title": "name: Hyderabad", "attrs": {"name": "Hyderabad"}}, {"id": "user_3", "label": "Person 3", "group": "user", "title": "name: Person 3\\norg: Northbridge\\nlocation: Pune", "attrs": {"name": "Person 3", "org": "Northbridge", "location": "Pune"}}, {"id": "org_northbridge", "label": "Northbridge", "group": "org", "title": "name: Northbridge", "attrs": {"name": "Northbridge"}}, {"id": "alias_3_544", "label": "@alias_3_544", "group": "alias", "title": "handle: @alias_3_544", "attrs": {"handle": "@alias_3_544"}}, {"id": "user_4", "label": "Person 4", "group": "user", "title": "name: Person 4\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 4", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "alias_4_664", "label": "@alias_4_664", "group": "alias", "title": "handle: @alias_4_664", "attrs": {"handle": "@alias_4_664"}}, {"id": "user_5", "label": "Person 5", "group": "user", "title": "name: Person 5\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 5", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "user_6", "label": "Person 6", "group": "user", "title": "name: Person 6\\norg: Apex Dynamics\\nlocation: Pune", "attrs": {"name": "Person 6", "org": "Apex Dynamics", "location": "Pune"}}, {"id": "user_7", "label": "Person 7", "group": "user", "title": "name: Person 7\\norg: Northbridge\\nlocation: Bengaluru", "attrs": {"name": "Person 7", "org": "Northbridge", "location": "Bengaluru"}}, {"id": "user_8", "label": "Person 8", "group": "user", "title": "name: Person 8\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 8", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "user_9", "label": "Person 9", "group": "user", "title": "name: Person 9\\norg: Apex Dynamics\\nlocation: Pune", "attrs": {"name": "Person 9", "org": "Apex Dynamics", "location": "Pune"}}, {"id": "alias_9_247", "label": "@alias_9_247", "group": "alias", "title": "handle: 
@alias_9_247", "attrs": {"handle": "@alias_9_247"}}, {"id": "user_10", "label": "Person 10", "group": "user", "title": "name: Person 10\\norg: Northbridge\\nlocation: Bengaluru", "attrs": {"name": "Person 10", "org": "Northbridge", "location": "Bengaluru"}}, {"id": "user_11", "label": "Person 11", "group": "user", "title": "name: Person 11\\norg: Northbridge\\nlocation: Pune", "attrs": {"name": "Person 11", "org": "Northbridge", "location": "Pune"}}, {"id": "alias_11_684", "label": "@alias_11_684", "group": "alias", "title": "handle: @alias_11_684", "attrs": {"handle": "@alias_11_684"}}, {"id": "user_12", "label": "Person 12", "group": "user", "title": "name: Person 12\\norg: Northbridge\\nlocation: Pune", "attrs": {"name": "Person 12", "org": "Northbridge", "location": "Pune"}}, {"id": "user_13", "label": "Person 13", "group": "user", "title": "name: Person 13\\norg: Northbridge\\nlocation: Bengaluru", "attrs": {"name": "Person 13", "org": "Northbridge", "location": "Bengaluru"}}, {"id": "user_14", "label": "Person 14", "group": "user", "title": "name: Person 14\\norg: Northbridge\\nlocation: Pune", "attrs": {"name": "Person 14", "org": "Northbridge", "location": "Pune"}}, {"id": "user_15", "label": "Person 15", "group": "user", "title": "name: Person 15\\norg: Northbridge\\nlocation: Delhi", "attrs": {"name": "Person 15", "org": "Northbridge", "location": "Delhi"}}, {"id": "loc_delhi", "label": "Delhi", "group": "location", "title": "name: Delhi", "attrs": {"name": "Delhi"}}, {"id": "user_16", "label": "Person 16", "group": "user", "title": "name: Person 16\\norg: Helios Labs\\nlocation: Delhi", "attrs": {"name": "Person 16", "org": "Helios Labs", "location": "Delhi"}}, {"id": "user_17", "label": "Person 17", "group": "user", "title": "name: Person 17\\norg: Apex Dynamics\\nlocation: Pune", "attrs": {"name": "Person 17", "org": "Apex Dynamics", "location": "Pune"}}, {"id": "user_18", "label": "Person 18", "group": "user", "title": "name: Person 18\\norg: Apex 
Dynamics\\nlocation: Bengaluru", "attrs": {"name": "Person 18", "org": "Apex Dynamics", "location": "Bengaluru"}}, {"id": "user_19", "label": "Person 19", "group": "user", "title": "name: Person 19\\norg: Northbridge\\nlocation: Delhi", "attrs": {"name": "Person 19", "org": "Northbridge", "location": "Delhi"}}, {"id": "user_20", "label": "Person 20", "group": "user", "title": "name: Person 20\\norg: Northbridge\\nlocation: Delhi", "attrs": {"name": "Person 20", "org": "Northbridge", "location": "Delhi"}}, {"id": "alias_20_174", "label": "@alias_20_174", "group": "alias", "title": "handle: @alias_20_174", "attrs": {"handle": "@alias_20_174"}}, {"id": "user_21", "label": "Person 21", "group": "user", "title": "name: Person 21\\norg: Apex Dynamics\\nlocation: Delhi", "attrs": {"name": "Person 21", "org": "Apex Dynamics", "location": "Delhi"}}, {"id": "alias_21_450", "label": "@alias_21_450", "group": "alias", "title": "handle: @alias_21_450", "attrs": {"handle": "@alias_21_450"}}, {"id": "user_22", "label": "Person 22", "group": "user", "title": "name: Person 22\\norg: Apex Dynamics\\nlocation: Delhi", "attrs": {"name": "Person 22", "org": "Apex Dynamics", "location": "Delhi"}}, {"id": "user_23", "label": "Person 23", "group": "user", "title": "name: Person 23\\norg: Northbridge\\nlocation: Bengaluru", "attrs": {"name": "Person 23", "org": "Northbridge", "location": "Bengaluru"}}, {"id": "user_24", "label": "Person 24", "group": "user", "title": "name: Person 24\\norg: Northbridge\\nlocation: Hyderabad", "attrs": {"name": "Person 24", "org": "Northbridge", "location": "Hyderabad"}}, {"id": "alias_24_458", "label": "@alias_24_458", "group": "alias", "title": "handle: @alias_24_458", "attrs": {"handle": "@alias_24_458"}}, {"id": "user_25", "label": "Person 25", "group": "user", "title": "name: Person 25\\norg: Northbridge\\nlocation: Delhi", "attrs": {"name": "Person 25", "org": "Northbridge", "location": "Delhi"}}, {"id": "user_26", "label": "Person 26", "group": 
"user", "title": "name: Person 26\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 26", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "user_27", "label": "Person 27", "group": "user", "title": "name: Person 27\\norg: Helios Labs\\nlocation: Delhi", "attrs": {"name": "Person 27", "org": "Helios Labs", "location": "Delhi"}}, {"id": "user_28", "label": "Person 28", "group": "user", "title": "name: Person 28\\norg: Apex Dynamics\\nlocation: Bengaluru", "attrs": {"name": "Person 28", "org": "Apex Dynamics", "location": "Bengaluru"}}, {"id": "user_29", "label": "Person 29", "group": "user", "title": "name: Person 29\\norg: Helios Labs\\nlocation: Delhi", "attrs": {"name": "Person 29", "org": "Helios Labs", "location": "Delhi"}}, {"id": "alias_29_495", "label": "@alias_29_495", "group": "alias", "title": "handle: @alias_29_495", "attrs": {"handle": "@alias_29_495"}}, {"id": "user_30", "label": "Person 30", "group": "user", "title": "name: Person 30\\norg: Northbridge\\nlocation: Hyderabad", "attrs": {"name": "Person 30", "org": "Northbridge", "location": "Hyderabad"}}, {"id": "alias_30_572", "label": "@alias_30_572", "group": "alias", "title": "handle: @alias_30_572", "attrs": {"handle": "@alias_30_572"}}, {"id": "user_31", "label": "Person 31", "group": "user", "title": "name: Person 31\\norg: Helios Labs\\nlocation: Pune", "attrs": {"name": "Person 31", "org": "Helios Labs", "location": "Pune"}}, {"id": "user_32", "label": "Person 32", "group": "user", "title": "name: Person 32\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 32", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "alias_32_394", "label": "@alias_32_394", "group": "alias", "title": "handle: @alias_32_394", "attrs": {"handle": "@alias_32_394"}}, {"id": "user_33", "label": "Person 33", "group": "user", "title": "name: Person 33\\norg: Apex Dynamics\\nlocation: Pune", "attrs": {"name": "Person 33", "org": "Apex Dynamics", "location": "Pune"}}, {"id": 
"user_34", "label": "Person 34", "group": "user", "title": "name: Person 34\\norg: Helios Labs\\nlocation: Bengaluru", "attrs": {"name": "Person 34", "org": "Helios Labs", "location": "Bengaluru"}}, {"id": "alias_34_511", "label": "@alias_34_511", "group": "alias", "title": "handle: @alias_34_511", "attrs": {"handle": "@alias_34_511"}}, {"id": "user_35", "label": "Person 35", "group": "user", "title": "name: Person 35\\norg: Northbridge\\nlocation: Hyderabad", "attrs": {"name": "Person 35", "org": "Northbridge", "location": "Hyderabad"}}, {"id": "user_36", "label": "Person 36", "group": "user", "title": "name: Person 36\\norg: Helios Labs\\nlocation: Hyderabad", "attrs": {"name": "Person 36", "org": "Helios Labs", "location": "Hyderabad"}}, {"id": "user_37", "label": "Person 37", "group": "user", "title": "name: Person 37\\norg: Helios Labs\\nlocation: Delhi", "attrs": {"name": "Person 37", "org": "Helios Labs", "location": "Delhi"}}, {"id": "user_38", "label": "Person 38", "group": "user", "title": "name: Person 38\\norg: Apex Dynamics\\nlocation: Bengaluru", "attrs": {"name": "Person 38", "org": "Apex Dynamics", "location": "Bengaluru"}}, {"id": "alias_38_337", "label": "@alias_38_337", "group": "alias", "title": "handle: @alias_38_337", "attrs": {"handle": "@alias_38_337"}}, {"id": "user_39", "label": "Person 39", "group": "user", "title": "name: Person 39\\norg: Northbridge\\nlocation: Pune", "attrs": {"name": "Person 39", "org": "Northbridge", "location": "Pune"}}, {"id": "alias_39_951", "label": "@alias_39_951", "group": "alias", "title": "handle: @alias_39_951", "attrs": {"handle": "@alias_39_951"}}, {"id": "alias_seed_001", "label": "@shadow_seed", "group": "alias", "title": "handle: @shadow_seed", "attrs": {"handle": "@shadow_seed"}}, {"id": "user_seed_001", "label": "Seed User", "group": "user", "title": "name: Seed User\\norg: Helios Labs\\nlocation: Pune", "attrs": {"name": "Seed User", "org": "Helios Labs", "location": "Pune"}}], "edges": [{"id": 
"c_0", "from": "user_0", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_1", "from": "user_0", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_2", "from": "user_1", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_3", "from": "user_1", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_4", "from": "user_2", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_5", "from": "user_2", "to": "loc_hyderabad", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_6", "from": "user_3", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_7", "from": "user_3", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_8", "from": "alias_3_544", "to": "user_3", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_9", "from": "user_4", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_10", "from": "user_4", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_11", "from": "alias_4_664", "to": "user_4", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": 
"canonical"}, {"id": "c_12", "from": "user_5", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_13", "from": "user_5", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_14", "from": "user_6", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_15", "from": "user_6", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_16", "from": "user_7", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_17", "from": "user_7", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_18", "from": "user_8", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_19", "from": "user_8", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_20", "from": "user_9", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_21", "from": "user_9", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_22", "from": "alias_9_247", "to": "user_9", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_23", "from": "user_10", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 
1, "confidence": 1.0, "status": "canonical"}, {"id": "c_24", "from": "user_10", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_25", "from": "user_11", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_26", "from": "user_11", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_27", "from": "alias_11_684", "to": "user_11", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_28", "from": "user_12", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_29", "from": "user_12", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_30", "from": "user_13", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_31", "from": "user_13", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_32", "from": "user_14", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_33", "from": "user_14", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_34", "from": "user_15", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_35", "from": "user_15", "to": "loc_delhi", "label": "located_in", "arrows": 
"to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_36", "from": "user_16", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_37", "from": "user_16", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_38", "from": "user_17", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_39", "from": "user_17", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_40", "from": "user_18", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_41", "from": "user_18", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_42", "from": "user_19", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_43", "from": "user_19", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_44", "from": "user_20", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_45", "from": "user_20", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_46", "from": "alias_20_174", "to": "user_20", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_47", "from": "user_21", "to": 
"org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_48", "from": "user_21", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_49", "from": "alias_21_450", "to": "user_21", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_50", "from": "user_22", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_51", "from": "user_22", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_52", "from": "user_23", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_53", "from": "user_23", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_54", "from": "user_24", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_55", "from": "user_24", "to": "loc_hyderabad", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_56", "from": "alias_24_458", "to": "user_24", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_57", "from": "user_25", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_58", "from": "user_25", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, 
{"id": "c_59", "from": "user_26", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_60", "from": "user_26", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_61", "from": "user_27", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_62", "from": "user_27", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_63", "from": "user_28", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_64", "from": "user_28", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_65", "from": "user_29", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_66", "from": "user_29", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_67", "from": "alias_29_495", "to": "user_29", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_68", "from": "user_30", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_69", "from": "user_30", "to": "loc_hyderabad", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_70", "from": "alias_30_572", "to": "user_30", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, 
"confidence": 1.0, "status": "canonical"}, {"id": "c_71", "from": "user_31", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_72", "from": "user_31", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_73", "from": "user_32", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_74", "from": "user_32", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_75", "from": "alias_32_394", "to": "user_32", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_76", "from": "user_33", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_77", "from": "user_33", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_78", "from": "user_34", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_79", "from": "user_34", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_80", "from": "alias_34_511", "to": "user_34", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_81", "from": "user_35", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_82", "from": "user_35", "to": "loc_hyderabad", "label": "located_in", "arrows": 
"to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_83", "from": "user_36", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_84", "from": "user_36", "to": "loc_hyderabad", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_85", "from": "user_37", "to": "org_helios_labs", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_86", "from": "user_37", "to": "loc_delhi", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_87", "from": "user_38", "to": "org_apex_dynamics", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_88", "from": "user_38", "to": "loc_bengaluru", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_89", "from": "alias_38_337", "to": "user_38", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_90", "from": "user_39", "to": "org_northbridge", "label": "works_at", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_91", "from": "user_39", "to": "loc_pune", "label": "located_in", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_92", "from": "alias_39_951", "to": "user_39", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_93", "from": "user_37", "to": "user_11", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_94", "from": "user_16", "to": "user_18", 
"label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_95", "from": "user_0", "to": "user_9", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_96", "from": "user_26", "to": "user_34", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_97", "from": "user_23", "to": "user_39", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_98", "from": "user_36", "to": "user_20", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_99", "from": "user_8", "to": "user_32", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_100", "from": "user_39", "to": "user_3", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_101", "from": "user_29", "to": "user_35", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_102", "from": "user_25", "to": "user_6", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_103", "from": "user_30", "to": "user_25", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_104", "from": "user_3", "to": "user_12", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_105", "from": "user_4", "to": "user_13", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_106", "from": "user_28", "to": 
"user_10", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_107", "from": "user_7", "to": "user_21", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_108", "from": "user_38", "to": "user_3", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_109", "from": "user_6", "to": "user_0", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_110", "from": "user_36", "to": "user_9", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_111", "from": "user_34", "to": "user_6", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_112", "from": "user_23", "to": "user_39", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.8, "status": "canonical"}, {"id": "c_113", "from": "alias_seed_001", "to": "user_seed_001", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 1.0, "status": "canonical"}, {"id": "c_114", "from": "alias_seed_001", "to": "user_13", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}, {"id": "c_115", "from": "user_9", "to": "user_seed_001", "label": "mentions", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}, {"id": "c_116", "from": "user_38", "to": "user_23", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}, {"id": "c_117", "from": "user_7", "to": "user_31", "label": "mentions", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}, {"id": "c_118", 
"from": "user_19", "to": "user_5", "label": "connected_to", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}, {"id": "c_119", "from": "alias_21_450", "to": "user_16", "label": "alias_of", "arrows": "to", "color": "#1f2937", "width": 1, "confidence": 0.7, "status": "canonical"}]}, "episode_graph": {"nodes": [{"id": "alias_30_572", "label": "@alias_30_572", "group": "alias", "attrs": {"handle": "@alias_30_572"}}, {"id": "user_30", "label": "Person 30", "group": "user", "attrs": {"name": "Person 30", "org": "Northbridge", "location": "Hyderabad"}}], "edges": [{"id": "e_0", "from": "alias_30_572", "to": "user_30", "label": "alias_of", "arrows": "to", "color": "#16a34a", "dashes": false, "width": 2, "status": "matched", "confidence": 1.0}]}, "views": {"microblog_posts": [{"post_id": "post_0", "user_id": "user_0", "canonical_user": "user_0", "text": "Update 0 from Helios Labs #pune", "mentions": ["user_33"], "timestamp": 1000}, {"post_id": "post_1", "user_id": "alias_30_572", "canonical_user": "user_30", "text": "Update 1 from Apex Dynamics #bengaluru", "mentions": ["user_34"], "timestamp": 1001}, {"post_id": "post_2", "user_id": "user_2", "canonical_user": "user_2", "text": "Update 2 from Apex Dynamics #hyderabad", "mentions": ["user_19"], "timestamp": 1002}, {"post_id": "post_3", "user_id": "user_3", "canonical_user": "user_3", "text": "Update 3 from Northbridge #pune", "mentions": ["user_16"], "timestamp": 1003}, {"post_id": "post_4", "user_id": "user_4", "canonical_user": "user_4", "text": "Update 4 from Helios Labs #bengaluru", "mentions": ["user_22"], "timestamp": 1004}, {"post_id": "post_5", "user_id": "user_5", "canonical_user": "user_5", "text": "Update 5 from Helios Labs #bengaluru", "mentions": ["user_32"], "timestamp": 1005}, {"post_id": "post_6", "user_id": "alias_11_684", "canonical_user": "user_11", "text": "Update 6 from Apex Dynamics #pune", "mentions": ["user_12"], "timestamp": 1006}, {"post_id": "post_7", 
"user_id": "user_7", "canonical_user": "user_7", "text": "Update 7 from Northbridge #bengaluru", "mentions": ["user_14"], "timestamp": 1007}, {"post_id": "post_8", "user_id": "alias_29_495", "canonical_user": "user_29", "text": "Update 8 from Helios Labs #bengaluru", "mentions": ["user_1"], "timestamp": 1008}, {"post_id": "post_9", "user_id": "user_9", "canonical_user": "user_9", "text": "Update 9 from Apex Dynamics #pune", "mentions": ["user_30"], "timestamp": 1009}, {"post_id": "post_10", "user_id": "alias_38_337", "canonical_user": "user_38", "text": "Update 10 from Northbridge #bengaluru", "mentions": ["user_22"], "timestamp": 1010}, {"post_id": "post_11", "user_id": "alias_38_337", "canonical_user": "user_38", "text": "Update 11 from Northbridge #pune", "mentions": ["user_23"], "timestamp": 1011}, {"post_id": "post_12", "user_id": "alias_4_664", "canonical_user": "user_4", "text": "Update 12 from Northbridge #pune", "mentions": ["user_12"], "timestamp": 1012}, {"post_id": "post_13", "user_id": "alias_29_495", "canonical_user": "user_29", "text": "Update 13 from Northbridge #bengaluru", "mentions": ["user_39"], "timestamp": 1013}, {"post_id": "post_14", "user_id": "user_14", "canonical_user": "user_14", "text": "Update 14 from Northbridge #pune", "mentions": ["user_22"], "timestamp": 1014}, {"post_id": "post_15", "user_id": "user_15", "canonical_user": "user_15", "text": "Rumor: Update 15 from Northbridge #delhi maybe fake", "mentions": ["user_7"], "timestamp": 1015}, {"post_id": "post_16", "user_id": "user_16", "canonical_user": "user_16", "text": "Update 16 from Helios Labs #delhi", "mentions": ["user_12"], "timestamp": 1016}, {"post_id": "post_17", "user_id": "user_17", "canonical_user": "user_17", "text": "Update 17 from Apex Dynamics #pune", "mentions": ["user_21"], "timestamp": 1017}, {"post_id": "post_18", "user_id": "alias_38_337", "canonical_user": "user_38", "text": "Update 18 from Apex Dynamics #bengaluru", "mentions": ["user_25"], "timestamp": 
1018}, {"post_id": "post_19", "user_id": "user_19", "canonical_user": "user_19", "text": "Rumor: Update 19 from Northbridge #delhi maybe fake", "mentions": ["user_10"], "timestamp": 1019}, {"post_id": "post_20", "user_id": "alias_9_247", "canonical_user": "user_9", "text": "Rumor: Update 20 from Northbridge #delhi maybe fake", "mentions": ["user_37"], "timestamp": 1020}, {"post_id": "post_21", "user_id": "user_21", "canonical_user": "user_21", "text": "Update 21 from Apex Dynamics #delhi", "mentions": ["user_9"], "timestamp": 1021}, {"post_id": "post_22", "user_id": "user_22", "canonical_user": "user_22", "text": "Update 22 from Apex Dynamics #delhi", "mentions": ["user_30"], "timestamp": 1022}, {"post_id": "post_23", "user_id": "user_23", "canonical_user": "user_23", "text": "Update 23 from Northbridge #bengaluru", "mentions": ["user_35"], "timestamp": 1023}, {"post_id": "post_24", "user_id": "user_24", "canonical_user": "user_24", "text": "Rumor: Update 24 from Northbridge #hyderabad maybe fake", "mentions": ["user_6"], "timestamp": 1024}, {"post_id": "post_25", "user_id": "user_25", "canonical_user": "user_25", "text": "Update 25 from Northbridge #delhi", "mentions": ["user_27"], "timestamp": 1025}, {"post_id": "post_26", "user_id": "user_26", "canonical_user": "user_26", "text": "Update 26 from Helios Labs #bengaluru", "mentions": ["user_13"], "timestamp": 1026}, {"post_id": "post_27", "user_id": "alias_11_684", "canonical_user": "user_11", "text": "Update 27 from Helios Labs #delhi", "mentions": ["user_15"], "timestamp": 1027}, {"post_id": "post_28", "user_id": "user_28", "canonical_user": "user_28", "text": "Update 28 from Apex Dynamics #bengaluru", "mentions": ["user_34"], "timestamp": 1028}, {"post_id": "post_29", "user_id": "alias_9_247", "canonical_user": "user_9", "text": "Rumor: Update 29 from Helios Labs #delhi maybe fake", "mentions": ["user_22"], "timestamp": 1029}, {"post_id": "post_30", "user_id": "user_30", "canonical_user": "user_30", "text": 
"Update 30 from Northbridge #hyderabad", "mentions": ["user_33"], "timestamp": 1030}, {"post_id": "post_31", "user_id": "alias_30_572", "canonical_user": "user_30", "text": "Rumor: Update 31 from Helios Labs #pune maybe fake", "mentions": ["user_9"], "timestamp": 1031}, {"post_id": "post_32", "user_id": "user_32", "canonical_user": "user_32", "text": "Rumor: Update 32 from Helios Labs #bengaluru maybe fake", "mentions": ["user_28"], "timestamp": 1032}, {"post_id": "post_33", "user_id": "user_33", "canonical_user": "user_33", "text": "Update 33 from Apex Dynamics #pune", "mentions": ["user_9"], "timestamp": 1033}, {"post_id": "post_34", "user_id": "alias_29_495", "canonical_user": "user_29", "text": "Update 34 from Helios Labs #bengaluru", "mentions": ["user_7"], "timestamp": 1034}, {"post_id": "post_35", "user_id": "user_35", "canonical_user": "user_35", "text": "Update 35 from Northbridge #hyderabad", "mentions": ["user_33"], "timestamp": 1035}, {"post_id": "post_36", "user_id": "user_36", "canonical_user": "user_36", "text": "Update 36 from Helios Labs #hyderabad", "mentions": ["user_6"], "timestamp": 1036}, {"post_id": "post_37", "user_id": "user_37", "canonical_user": "user_37", "text": "Rumor: Update 37 from Helios Labs #delhi maybe fake", "mentions": ["user_12"], "timestamp": 1037}, {"post_id": "post_38", "user_id": "alias_39_951", "canonical_user": "user_39", "text": "Rumor: Update 38 from Apex Dynamics #bengaluru maybe fake", "mentions": ["user_28"], "timestamp": 1038}, {"post_id": "post_39", "user_id": "user_39", "canonical_user": "user_39", "text": "Update 39 from Northbridge #pune", "mentions": ["user_4"], "timestamp": 1039}, {"post_id": "post_40", "user_id": "alias_32_394", "canonical_user": "user_32", "text": "Update 40 from Helios Labs #pune", "mentions": ["user_38"], "timestamp": 1040}], "forum_threads": [{"thread_id": "thr_0", "topic": "startup", "author_id": "user_32", "comments": [{"user_id": "user_17", "text": "Following this."}, {"user_id": 
"user_28", "text": "Interesting link."}]}, {"thread_id": "thr_1", "topic": "infra", "author_id": "user_32", "comments": [{"user_id": "user_32", "text": "Following this."}, {"user_id": "user_15", "text": "Interesting link."}]}, {"thread_id": "thr_2", "topic": "ai", "author_id": "user_33", "comments": [{"user_id": "user_35", "text": "Following this."}, {"user_id": "user_12", "text": "Interesting link."}]}, {"thread_id": "thr_3", "topic": "startup", "author_id": "user_28", "comments": [{"user_id": "user_26", "text": "Following this."}, {"user_id": "user_7", "text": "Interesting link."}]}, {"thread_id": "thr_4", "topic": "infra", "author_id": "user_25", "comments": [{"user_id": "user_20", "text": "Following this."}, {"user_id": "user_4", "text": "Interesting link."}]}, {"thread_id": "thr_5", "topic": "infra", "author_id": "user_15", "comments": [{"user_id": "user_4", "text": "Following this."}, {"user_id": "user_13", "text": "Interesting link."}]}, {"thread_id": "thr_6", "topic": "security", "author_id": "user_19", "comments": [{"user_id": "user_9", "text": "Following this."}, {"user_id": "user_23", "text": "Interesting link."}]}, {"thread_id": "thr_7", "topic": "ai", "author_id": "user_9", "comments": [{"user_id": "user_8", "text": "Following this."}, {"user_id": "user_29", "text": "Interesting link."}]}, {"thread_id": "thr_8", "topic": "security", "author_id": "user_14", "comments": [{"user_id": "user_25", "text": "Following this."}, {"user_id": "user_31", "text": "Interesting link."}]}, {"thread_id": "thr_9", "topic": "startup", "author_id": "user_10", "comments": [{"user_id": "user_10", "text": "Following this."}, {"user_id": "user_27", "text": "Interesting link."}]}, {"thread_id": "thr_10", "topic": "infra", "author_id": "user_32", "comments": [{"user_id": "user_21", "text": "Following this."}, {"user_id": "user_26", "text": "Interesting link."}]}, {"thread_id": "thr_11", "topic": "ai", "author_id": "user_12", "comments": [{"user_id": "user_20", "text": "Following 
this."}, {"user_id": "user_5", "text": "Interesting link."}]}, {"thread_id": "thr_12", "topic": "security", "author_id": "user_23", "comments": [{"user_id": "user_21", "text": "Following this."}, {"user_id": "user_35", "text": "Interesting link."}]}], "profiles": [{"user_id": "user_0", "name": "Person 0", "org": "Helios Labs", "location": "Pune", "connections": ["user_9"], "work_history": ["Helios Labs"]}, {"user_id": "user_1", "name": "Person 1", "org": "Apex Dynamics", "location": "Bengaluru", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_2", "name": "Person 2", "org": "Apex Dynamics", "location": "Hyderabad", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_3", "name": "Person 3", "org": "Northbridge", "location": "Pune", "connections": ["user_12"], "work_history": ["Northbridge"]}, {"user_id": "user_4", "name": "Person 4", "org": "Helios Labs", "location": "Bengaluru", "connections": ["user_13"], "work_history": ["Helios Labs"]}, {"user_id": "user_5", "name": "Person 5", "org": "Helios Labs", "location": "Bengaluru", "connections": [], "work_history": ["Helios Labs"]}, {"user_id": "user_6", "name": "Person 6", "org": "Apex Dynamics", "location": "Pune", "connections": ["user_0"], "work_history": ["Apex Dynamics"]}, {"user_id": "user_7", "name": "Person 7", "org": "Northbridge", "location": "Bengaluru", "connections": ["user_21"], "work_history": ["Northbridge"]}, {"user_id": "user_8", "name": "Person 8", "org": "Helios Labs", "location": "Bengaluru", "connections": ["user_32"], "work_history": ["Helios Labs"]}, {"user_id": "user_9", "name": "Person 9", "org": "Apex Dynamics", "location": "Pune", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_10", "name": "Person 10", "org": "Northbridge", "location": "Bengaluru", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_11", "name": "Person 11", "org": "Northbridge", "location": "Pune", "connections": [], "work_history": 
["Northbridge"]}, {"user_id": "user_12", "name": "Person 12", "org": "Northbridge", "location": "Pune", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_13", "name": "Person 13", "org": "Northbridge", "location": "Bengaluru", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_14", "name": "Person 14", "org": "Northbridge", "location": "Pune", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_15", "name": "Person 15", "org": "Northbridge", "location": "Delhi", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_16", "name": "Person 16", "org": "Helios Labs", "location": "Delhi", "connections": ["user_18"], "work_history": ["Helios Labs"]}, {"user_id": "user_17", "name": "Person 17", "org": "Apex Dynamics", "location": "Pune", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_18", "name": "Person 18", "org": "Apex Dynamics", "location": "Bengaluru", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_19", "name": "Person 19", "org": "Northbridge", "location": "Delhi", "connections": ["user_5"], "work_history": ["Northbridge"]}, {"user_id": "user_20", "name": "Person 20", "org": "Northbridge", "location": "Delhi", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_21", "name": "Person 21", "org": "Apex Dynamics", "location": "Delhi", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_22", "name": "Person 22", "org": "Apex Dynamics", "location": "Delhi", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_23", "name": "Person 23", "org": "Northbridge", "location": "Bengaluru", "connections": ["user_39", "user_39"], "work_history": ["Northbridge"]}, {"user_id": "user_24", "name": "Person 24", "org": "Northbridge", "location": "Hyderabad", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_25", "name": "Person 25", "org": "Northbridge", "location": "Delhi", 
"connections": ["user_6"], "work_history": ["Northbridge"]}, {"user_id": "user_26", "name": "Person 26", "org": "Helios Labs", "location": "Bengaluru", "connections": ["user_34"], "work_history": ["Helios Labs"]}, {"user_id": "user_27", "name": "Person 27", "org": "Helios Labs", "location": "Delhi", "connections": [], "work_history": ["Helios Labs"]}, {"user_id": "user_28", "name": "Person 28", "org": "Apex Dynamics", "location": "Bengaluru", "connections": ["user_10"], "work_history": ["Apex Dynamics"]}, {"user_id": "user_29", "name": "Person 29", "org": "Helios Labs", "location": "Delhi", "connections": ["user_35"], "work_history": ["Helios Labs"]}, {"user_id": "user_30", "name": "Person 30", "org": "Northbridge", "location": "Hyderabad", "connections": ["user_25"], "work_history": ["Northbridge"]}, {"user_id": "user_31", "name": "Person 31", "org": "Helios Labs", "location": "Pune", "connections": [], "work_history": ["Helios Labs"]}, {"user_id": "user_32", "name": "Person 32", "org": "Helios Labs", "location": "Bengaluru", "connections": [], "work_history": ["Helios Labs"]}, {"user_id": "user_33", "name": "Person 33", "org": "Apex Dynamics", "location": "Pune", "connections": [], "work_history": ["Apex Dynamics"]}, {"user_id": "user_34", "name": "Person 34", "org": "Helios Labs", "location": "Bengaluru", "connections": ["user_6"], "work_history": ["Helios Labs"]}, {"user_id": "user_35", "name": "Person 35", "org": "Northbridge", "location": "Hyderabad", "connections": [], "work_history": ["Northbridge"]}, {"user_id": "user_36", "name": "Person 36", "org": "Helios Labs", "location": "Hyderabad", "connections": ["user_20", "user_9"], "work_history": ["Helios Labs"]}, {"user_id": "user_37", "name": "Person 37", "org": "Helios Labs", "location": "Delhi", "connections": ["user_11"], "work_history": ["Helios Labs"]}, {"user_id": "user_38", "name": "Person 38", "org": "Apex Dynamics", "location": "Bengaluru", "connections": ["user_3", "user_23"], "work_history": 
["Apex Dynamics"]}, {"user_id": "user_39", "name": "Person 39", "org": "Northbridge", "location": "Pune", "connections": ["user_3"], "work_history": ["Northbridge"]}, {"user_id": "user_seed_001", "name": "Seed User", "org": "Helios Labs", "location": "Pune", "connections": [], "work_history": ["Helios Labs"]}, {"user_id": "noise_0", "name": "P569", "org": "Unknown Ventures", "location": "Remote", "connections": [], "work_history": []}, {"user_id": "noise_1", "name": "P493", "org": "Unknown Ventures", "location": "Unknown", "connections": [], "work_history": []}, {"user_id": "noise_2", "name": "P624", "org": "Stealth Co", "location": "Remote", "connections": [], "work_history": []}, {"user_id": "noise_3", "name": "P907", "org": "Stealth Co", "location": "Remote", "connections": [], "work_history": []}]}, "task": {"task_id": "task_1", "task_type": "identity_resolution", "question": "Which canonical user owns alias alias_30_572?", "answer": "user_30"}};
|
| 261 |
+
|
| 262 |
+
// Render the headline metric cards (#stats) and the hero pill row
// (#hero-pills) from an evaluation summary object. Any missing metric
// falls back to 0 so a partial summary still renders.
function metricCards(summary) {
  const cardEntries = [
    ["leaderboard_score", summary.leaderboard_score || 0],
    ["task_success_rate", summary.task_success_rate || 0],
    ["avg_graph_f1", summary.avg_graph_f1 || 0],
    ["retrieval_signal", summary.retrieval_signal || 0],
    ["structural_signal", summary.structural_signal || 0],
    ["tool_efficiency", summary.tool_efficiency || 0],
    ["avg_reward", summary.avg_reward || 0]
  ];
  const statRoot = document.getElementById("stats");
  statRoot.innerHTML = "";
  for (const [metricName, metricValue] of cardEntries) {
    const card = document.createElement("div");
    card.className = "stat";
    card.innerHTML = `<div class="k">${metricName}</div><div class="v">${Number(metricValue).toFixed(3)}</div>`;
    statRoot.appendChild(card);
  }

  // Secondary stats shown as compact pills; episode count comes from the
  // globally embedded payload, not the summary.
  const pillRow = document.getElementById("hero-pills");
  pillRow.innerHTML = "";
  const pillTexts = [
    `deanonymization: ${Number(summary.deanonymization_accuracy || 0).toFixed(3)}`,
    `avg steps: ${Number(summary.avg_steps_to_solution || 0).toFixed(2)}`,
    `episodes: ${(payload.episodes || []).length}`
  ];
  for (const text of pillTexts) {
    const pill = document.createElement("span");
    pill.className = "pill";
    pill.textContent = text;
    pillRow.appendChild(pill);
  }
}
|
| 294 |
+
|
| 295 |
+
// Populate the node-type filter panel (#type-filters) with one checked
// checkbox per group name. Checkbox ids follow the `type_<group>` scheme.
function buildTypeFilters(allGroups) {
  const panel = document.getElementById("type-filters");
  panel.innerHTML = "";
  for (const group of allGroups) {
    const checkboxId = `type_${group}`;
    const entry = document.createElement("label");
    entry.className = "inline";
    entry.innerHTML = `<input type="checkbox" id="${checkboxId}" value="${group}" checked /> <span>${group}</span>`;
    panel.appendChild(entry);
  }
}
|
| 306 |
+
|
| 307 |
+
// Set up the vis-network graph panel: two switchable layers (canonical vs.
// episode graph), node-type checkbox filters, relation/node text filters,
// a fit-to-view button, and click-to-inspect detail panes.
function createNetworkController() {
  const container = document.getElementById("graph-canvas");
  const banner = document.getElementById("graph-banner");
  const modeSelect = document.getElementById("graph-mode");
  const nodeSearch = document.getElementById("graph-search");
  const relFilter = document.getElementById("relation-filter");
  const fitBtn = document.getElementById("fit-graph");

  // Raw graph layers from the embedded payload; either may be absent.
  const rawLayers = {
    canonical: payload.canonical_graph || { nodes: [], edges: [] },
    episode: payload.episode_graph || { nodes: [], edges: [] }
  };

  // Type filters are derived from the canonical layer's groups only
  // (assumes the episode layer uses the same group vocabulary — TODO confirm).
  const allGroups = Array.from(new Set((rawLayers.canonical.nodes || []).map(n => n.group || "unknown"))).sort();
  buildTypeFilters(allGroups);

  // Mutable view state; every control handler mutates this, then refresh().
  const state = {
    mode: "canonical",
    relationQuery: "",
    nodeQuery: "",
  };

  const nodesDS = new vis.DataSet([]);
  const edgesDS = new vis.DataSet([]);
  const network = new vis.Network(container, { nodes: nodesDS, edges: edgesDS }, {
    interaction: { hover: true, navigationButtons: true, keyboard: true },
    physics: { stabilization: false, barnesHut: { springLength: 130 } },
    edges: { smooth: true, font: { size: 10 } },
    nodes: { shape: "dot", size: 11, font: { size: 10 } }
  });

  // Currently checked node-type groups, read live from the filter panel.
  function activeGroups() {
    const checked = Array.from(document.querySelectorAll('#type-filters input[type="checkbox"]:checked'));
    return new Set(checked.map(x => x.value));
  }

  // Highlight (recolor + enlarge) a node whose id/label matches the query.
  function styleNode(node, query) {
    const text = `${node.id} ${node.label || ""}`.toLowerCase();
    const hit = query && text.includes(query);
    return {
      ...node,
      color: hit ? "#f59e0b" : undefined,
      size: hit ? 18 : 11,
    };
  }

  // Recompute visible nodes/edges from state and push them into the vis
  // DataSets. An edge survives only if both endpoints passed the node
  // filter AND its label matches the relation query (empty query = all).
  function refresh() {
    const raw = rawLayers[state.mode] || { nodes: [], edges: [] };
    const groups = activeGroups();
    const relQ = state.relationQuery.toLowerCase();
    const nodeQ = state.nodeQuery.toLowerCase();

    const nodes = (raw.nodes || []).filter(n => groups.has(n.group || "unknown")).map(n => styleNode(n, nodeQ));
    const nodeIds = new Set(nodes.map(n => n.id));
    const edges = (raw.edges || []).filter(e => nodeIds.has(e.from) && nodeIds.has(e.to)).filter(e => !relQ || String(e.label || "").toLowerCase().includes(relQ));

    nodesDS.clear();
    edgesDS.clear();
    nodesDS.add(nodes);
    edgesDS.add(edges);

    banner.textContent = state.mode === "canonical" ? "Layer: Canonical Graph" : "Layer: Episode Graph";
  }

  modeSelect.addEventListener("change", () => {
    state.mode = modeSelect.value;
    refresh();
  });
  relFilter.addEventListener("input", () => {
    state.relationQuery = relFilter.value || "";
    refresh();
  });
  nodeSearch.addEventListener("input", () => {
    state.nodeQuery = nodeSearch.value || "";
    refresh();
  });
  fitBtn.addEventListener("click", () => network.fit({ animation: true }));
  document.getElementById("type-filters").addEventListener("change", refresh);

  // Click handler: dump the clicked node plus its neighbor ids, or the
  // clicked edge, into the JSON detail panes.
  network.on("click", (params) => {
    if (params.nodes && params.nodes.length) {
      const node = nodesDS.get(params.nodes[0]);
      const connected = network.getConnectedNodes(node.id) || [];
      document.getElementById("node-detail").textContent = JSON.stringify({
        node,
        connected_nodes: connected
      }, null, 2);
    }
    if (params.edges && params.edges.length) {
      const edge = edgesDS.get(params.edges[0]);
      document.getElementById("edge-detail").textContent = JSON.stringify(edge, null, 2);
    }
  });

  // Initial draw with the default (canonical, unfiltered) state.
  refresh();
}
|
| 403 |
+
|
| 404 |
+
// Flatten the three raw dataset views into one searchable row list.
// Each row records its origin, a display id, the serialized record
// (used for substring search), and the raw record for the detail pane.
function buildRows(views) {
  const toRow = (source, id, record) => ({ source, id, text: JSON.stringify(record), raw: record });
  const rows = [];
  for (const post of views.microblog_posts || []) {
    rows.push(toRow("microblog", post.post_id || "post", post));
  }
  for (const thread of views.forum_threads || []) {
    rows.push(toRow("forum", thread.thread_id || "thread", thread));
  }
  for (const profile of views.profiles || []) {
    rows.push(toRow("profile", profile.user_id || "profile", profile));
  }
  return rows;
}
|
| 411 |
+
|
| 412 |
+
// Build the "database explorer" panel: a tabbed, searchable, row-limited
// table over the three raw dataset views. Clicking a row dumps the full
// record into the detail pane.
function initDatabaseExplorer() {
  const rows = buildRows(payload.views || {});
  const tabs = document.getElementById("db-tabs");
  const search = document.getElementById("db-search");
  const limit = document.getElementById("db-limit");
  const table = document.getElementById("db-table");
  const detail = document.getElementById("db-detail");

  // "all" is a pseudo-source that disables source filtering.
  const sources = ["all", "microblog", "forum", "profile"];
  const state = { source: "all", query: "", limit: 200 };

  // One tab button per source; clicking moves the "active" class and re-renders.
  tabs.innerHTML = "";
  sources.forEach((src) => {
    const btn = document.createElement("button");
    btn.className = `tab ${src === state.source ? "active" : ""}`;
    btn.textContent = src;
    btn.addEventListener("click", () => {
      state.source = src;
      Array.from(tabs.children).forEach((child) => child.classList.remove("active"));
      btn.classList.add("active");
      render();
    });
    tabs.appendChild(btn);
  });

  // Apply the active source tab plus a case-insensitive substring query
  // (matched against both the serialized record and the row id).
  function filtered() {
    const q = state.query.toLowerCase();
    return rows
      .filter((row) => state.source === "all" || row.source === state.source)
      .filter((row) => !q || row.text.toLowerCase().includes(q) || row.id.toLowerCase().includes(q));
  }

  // Rebuild the table, capped at state.limit rows; previews are truncated
  // to 120 characters to keep the table compact.
  function render() {
    const show = filtered().slice(0, state.limit);
    table.innerHTML = "<thead><tr><th>source</th><th>id</th><th>preview</th></tr></thead>";
    const body = document.createElement("tbody");
    show.forEach((row) => {
      const tr = document.createElement("tr");
      const preview = row.text.length > 120 ? `${row.text.slice(0, 120)}...` : row.text;
      tr.innerHTML = `<td>${row.source}</td><td class="mono">${row.id}</td><td>${preview}</td>`;
      tr.addEventListener("click", () => {
        detail.textContent = JSON.stringify(row.raw, null, 2);
      });
      body.appendChild(tr);
    });
    table.appendChild(body);
  }

  search.addEventListener("input", () => { state.query = search.value || ""; render(); });
  limit.addEventListener("change", () => { state.limit = Number(limit.value || 200); render(); });
  // Initial render with the default "all" tab and no query.
  render();
}
|
| 464 |
+
|
| 465 |
+
// Render the leaderboard table, ranked descending by the chosen metric
// (default: composite leaderboard score). Records with missing metrics
// sort as 0 and render as 0-valued cells.
function renderLeaderboard(records, sortBy = "leaderboard_score") {
  const ranked = records.slice().sort(
    (a, b) => (b.metrics?.[sortBy] || 0) - (a.metrics?.[sortBy] || 0)
  );
  const table = document.getElementById("leaderboard-table");
  table.innerHTML = "<thead><tr><th>rank</th><th>run</th><th>score</th><th>success</th><th>graph_f1</th><th>retrieval</th><th>structural</th><th>spawn</th><th>reward</th></tr></thead>";
  const tbody = document.createElement("tbody");
  ranked.forEach((record, rank) => {
    const m = record.metrics || {};
    const row = document.createElement("tr");
    row.innerHTML = `<td>${rank + 1}</td><td>${record.run_name || record.run_id || "run"}</td><td>${(m.leaderboard_score || 0).toFixed(4)}</td><td>${(m.task_success_rate || 0).toFixed(3)}</td><td>${(m.avg_graph_f1 || 0).toFixed(3)}</td><td>${(m.retrieval_signal || 0).toFixed(3)}</td><td>${(m.structural_signal || 0).toFixed(3)}</td><td>${(m.spawn_signal || 0).toFixed(3)}</td><td>${(m.avg_reward || 0).toFixed(3)}</td>`;
    tbody.appendChild(row);
  });
  table.appendChild(tbody);
}
|
| 478 |
+
|
| 479 |
+
// Draw a radar chart of the normalized (0..1) summary metrics onto the
// #summary-chart canvas.
function drawSummaryChart(summary) {
  const axes = [
    ["success", summary.task_success_rate],
    ["graph_f1", summary.avg_graph_f1],
    ["tool_eff", summary.tool_efficiency],
    ["deanon", summary.deanonymization_accuracy],
    ["retrieval", summary.retrieval_signal],
    ["structural", summary.structural_signal],
    ["score", summary.leaderboard_score]
  ];
  new Chart(document.getElementById("summary-chart"), {
    type: "radar",
    data: {
      labels: axes.map(([axisName]) => axisName),
      datasets: [{
        label: "normalized metrics",
        data: axes.map(([, axisValue]) => axisValue || 0),
        backgroundColor: "rgba(15,118,110,0.2)",
        borderColor: "#0f766e",
        pointBackgroundColor: "#d97706",
        pointRadius: 3
      }]
    },
    // Radar scale is pinned to [0, 1] because all plotted metrics are rates.
    options: { responsive: true, maintainAspectRatio: false, scales: { r: { min: 0, max: 1 } } }
  });
}
|
| 506 |
+
|
| 507 |
+
// Plot per-episode reward (left axis, unbounded) and graph F1 (right
// axis, fixed 0..1) as a dual-axis line chart on #trace-chart.
function drawTraceChart(episodes) {
  const points = episodes.map((episode, index) => ({
    label: `ep_${index + 1}`,
    reward: episode.reward || 0,
    f1: episode.graph_f1 || 0
  }));
  new Chart(document.getElementById("trace-chart"), {
    type: "line",
    data: {
      labels: points.map(point => point.label),
      datasets: [
        { label: "reward", data: points.map(point => point.reward), borderColor: "#0f766e", yAxisID: "y", tension: 0.2 },
        { label: "graph_f1", data: points.map(point => point.f1), borderColor: "#d97706", yAxisID: "y1", tension: 0.2 }
      ]
    },
    options: {
      responsive: true,
      maintainAspectRatio: false,
      scales: {
        y: { position: "left" },
        // Secondary axis for F1; grid suppressed so only one grid is drawn.
        y1: { position: "right", min: 0, max: 1, grid: { drawOnChartArea: false } }
      }
    }
  });
}
|
| 530 |
+
|
| 531 |
+
// ---- Page bootstrap ----------------------------------------------------
// Wire the embedded payload into every dashboard panel. All payload
// sections are read defensively so a partial export still renders.
const summary = payload.summary || {};
metricCards(summary);

// Fix: `payload.task.*` was the only unguarded payload access on the page;
// a missing `task` section threw here and aborted the rest of the bootstrap.
const task = payload.task || {};
document.getElementById("task-id").textContent = task.task_id || "";
document.getElementById("task-type").textContent = task.task_type || "";
document.getElementById("task-question").textContent = task.question || "";
document.getElementById("task-answer").textContent = task.answer || "";

createNetworkController();
initDatabaseExplorer();

// Leaderboard is re-sortable via the #leader-sort select.
const leaderboard = payload.leaderboard || [];
const leaderSort = document.getElementById("leader-sort");
renderLeaderboard(leaderboard, leaderSort.value);
leaderSort.addEventListener("change", () => renderLeaderboard(leaderboard, leaderSort.value));

drawSummaryChart(summary);
drawTraceChart(payload.episodes || []);
</script>
|
| 550 |
+
</body>
|
| 551 |
+
</html>
|
config/seed_example.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seeding": {
|
| 3 |
+
"seeded_nodes": [
|
| 4 |
+
{
|
| 5 |
+
"node_id": "alias_seed_001",
|
| 6 |
+
"node_type": "alias",
|
| 7 |
+
"attrs": {
|
| 8 |
+
"handle": "@shadow_seed"
|
| 9 |
+
}
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"node_id": "user_seed_001",
|
| 13 |
+
"node_type": "user",
|
| 14 |
+
"attrs": {
|
| 15 |
+
"name": "Seed User",
|
| 16 |
+
"org": "Helios Labs",
|
| 17 |
+
"location": "Pune"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
],
|
| 21 |
+
"seeded_edges": [
|
| 22 |
+
{
|
| 23 |
+
"src": "alias_seed_001",
|
| 24 |
+
"rel": "alias_of",
|
| 25 |
+
"dst": "user_seed_001",
|
| 26 |
+
"confidence": 1.0
|
| 27 |
+
}
|
| 28 |
+
],
|
| 29 |
+
"seeded_questions": [
|
| 30 |
+
{
|
| 31 |
+
"task_type": "identity_resolution",
|
| 32 |
+
"question": "Which canonical user owns alias alias_seed_001?",
|
| 33 |
+
"answer": "user_seed_001",
|
| 34 |
+
"supporting_edges": [
|
| 35 |
+
{
|
| 36 |
+
"src": "alias_seed_001",
|
| 37 |
+
"rel": "alias_of",
|
| 38 |
+
"dst": "user_seed_001"
|
| 39 |
+
}
|
| 40 |
+
],
|
| 41 |
+
"metadata": {
|
| 42 |
+
"source": "manual_seed"
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"llm_generate_remaining_graph": true,
|
| 47 |
+
"llm_generate_remaining_tasks": true,
|
| 48 |
+
"llm_generated_edge_budget": 6,
|
| 49 |
+
"llm_generated_task_budget": 8
|
| 50 |
+
}
|
| 51 |
+
}
|
config/shared_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"environment": {
|
| 3 |
+
"n_users": 40,
|
| 4 |
+
"alias_density": 0.35,
|
| 5 |
+
"noise_level": 0.15,
|
| 6 |
+
"red_herring_rate": 0.1,
|
| 7 |
+
"max_steps": 18,
|
| 8 |
+
"seed": 7
|
| 9 |
+
},
|
| 10 |
+
"swarm": {
|
| 11 |
+
"enabled": true,
|
| 12 |
+
"max_agents": 3,
|
| 13 |
+
"max_breadth": 2,
|
| 14 |
+
"max_width": 2,
|
| 15 |
+
"max_depth": 2,
|
| 16 |
+
"planner_rounds": 2,
|
| 17 |
+
"tools_per_agent": 1
|
| 18 |
+
},
|
| 19 |
+
"spawn_reward": {
|
| 20 |
+
"lambda_parallel": 0.15,
|
| 21 |
+
"lambda_finish": 0.2,
|
| 22 |
+
"anneal": 1.0,
|
| 23 |
+
"max_parallel_hint": 3
|
| 24 |
+
},
|
| 25 |
+
"seeding": {
|
| 26 |
+
"seeded_nodes": [],
|
| 27 |
+
"seeded_edges": [],
|
| 28 |
+
"seeded_questions": [],
|
| 29 |
+
"llm_generate_remaining_graph": true,
|
| 30 |
+
"llm_generate_remaining_tasks": true,
|
| 31 |
+
"llm_generated_edge_budget": 6,
|
| 32 |
+
"llm_generated_task_budget": 8
|
| 33 |
+
},
|
| 34 |
+
"runtime": {
|
| 35 |
+
"default_episodes": 20,
|
| 36 |
+
"leaderboard_path": "artifacts/leaderboard.json",
|
| 37 |
+
"dashboard_path": "artifacts/osint_dashboard.html",
|
| 38 |
+
"sweep_dashboard_dir": "artifacts/sweep_dashboards"
|
| 39 |
+
}
|
| 40 |
+
}
|
docs/reward_design_notes.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reward Design Notes
|
| 2 |
+
|
| 3 |
+
This environment uses a composite reward that adapts ideas from:
|
| 4 |
+
|
| 5 |
+
- AutoGraph-R1 (arXiv:2510.15339)
|
| 6 |
+
- UniRel (arXiv:2512.17043)
|
| 7 |
+
- DeepPath (EMNLP 2017, D17-1060)
|
| 8 |
+
- Multi-Hop KG Reasoning with Reward Shaping (EMNLP 2018, D18-1362)
|
| 9 |
+
- Kimi K2.5 (arXiv:2602.02276) for PARL-style swarm auxiliary shaping
|
| 10 |
+
|
| 11 |
+
Additional related context consulted:
|
| 12 |
+
|
| 13 |
+
- MINERVA (arXiv:1711.05851) for query-conditioned walk-style reasoning over KG paths.
|
| 14 |
+
|
| 15 |
+
## Components in this Branch
|
| 16 |
+
|
| 17 |
+
The implementation follows a staged reward design:
|
| 18 |
+
|
| 19 |
+
1. edge-level rewards during graph construction (`ADD_EDGE`)
|
| 20 |
+
2. answer-level rewards for retrieval usefulness and final task utility (`ANSWER`)
|
| 21 |
+
3. evaluation-level composite leaderboard score for benchmark ranking
|
| 22 |
+
|
| 23 |
+
### 1) Edge addition reward
|
| 24 |
+
|
| 25 |
+
For each `ADD_EDGE`, the reward combines:
|
| 26 |
+
|
| 27 |
+
- Global accuracy term (DeepPath):
|
| 28 |
+
- $r_{global} = +1$ if a candidate edge is correct, else $-1$ (scaled in code for stability).
|
| 29 |
+
- Soft shaping term (D18 reward shaping):
|
| 30 |
+
- $R = R_b + (1 - R_b) f(s, r, o)$, where $f$ is a soft fact plausibility score.
|
| 31 |
+
- In code, $f$ is approximated by relation/type priors plus small domain priors.
|
| 32 |
+
- Efficiency term (DeepPath):
|
| 33 |
+
- $r_{efficiency} \propto 1 / \text{step\_count}$.
|
| 34 |
+
- Diversity term (DeepPath):
|
| 35 |
+
- novelty from cosine dissimilarity of edge signatures; repeated patterns are down-weighted.
|
| 36 |
+
- Relation/entity informativeness (UniRel):
|
| 37 |
+
- relation rarity via normalized IDF of relation labels,
|
| 38 |
+
- entity informativeness via inverse hub-penalty.
|
| 39 |
+
- Connectivity gain term:
|
| 40 |
+
- rewards bridge edges that connect previously disconnected graph regions.
|
| 41 |
+
|
| 42 |
+
### 2) Final answer reward
|
| 43 |
+
|
| 44 |
+
For `ANSWER`, the reward combines:
|
| 45 |
+
|
| 46 |
+
- format validity,
|
| 47 |
+
- answer correctness,
|
| 48 |
+
- knowledge-carrying utility (AutoGraph-R1 style):
|
| 49 |
+
  - $R_C(q, y, G) = \mathbb{I}[\text{deducible}(q, y \mid G)]$.
|
| 50 |
+
- knowledge-indexing utility (AutoGraph-R1 style):
|
| 51 |
+
  - $R_I(q, D_{gold}, G) = |\mathrm{Top}\text{-}k(G, q) \cap D_{gold}| / |D_{gold}|$,
|
| 52 |
+
- approximated in this environment with evidence recall over tool outputs.
|
| 53 |
+
- connectivity (UniRel style):
|
| 54 |
+
- discrete connectivity reward over extracted seed entities, normalized for stable mixing.
|
| 55 |
+
- graph F1 against supporting edges,
|
| 56 |
+
- compactness penalty for unnecessary extra edges,
|
| 57 |
+
- efficiency bonus,
|
| 58 |
+
- relation/entity informativeness for the constructed subgraph,
|
| 59 |
+
- repetition penalty to discourage redundant relation generation patterns.
|
| 60 |
+
|
| 61 |
+
UniRel-style aggregate view represented in this branch:
|
| 62 |
+
|
| 63 |
+
$$
|
| 64 |
+
R(a) \approx R_{fmt} + R_{con} + w_1 R_{ent} + w_2 R_{rel} + \text{task utility terms}
|
| 65 |
+
$$
|
| 66 |
+
|
| 67 |
+
with task utility terms coming from AutoGraph-inspired $R_C$ and $R_I$ components.
|
| 68 |
+
|
| 69 |
+
## Telemetry
|
| 70 |
+
|
| 71 |
+
Per-step component rewards are aggregated into `info["reward_components"]`, enabling:
|
| 72 |
+
|
| 73 |
+
- richer benchmark summaries,
|
| 74 |
+
- leaderboard ranking by composite utility,
|
| 75 |
+
- visual diagnostics in dashboard exports.
|
| 76 |
+
|
| 77 |
+
Evaluation also computes derived retrieval and structural utility signals used in leaderboard ranking.
|
| 78 |
+
|
| 79 |
+
## Future Multi-Agent Notes
|
| 80 |
+
|
| 81 |
+
This branch now includes a low-width swarm baseline orchestrator that adds PARL-style auxiliary shaping on top of the core edge and answer rewards.
|
| 82 |
+
|
| 83 |
+
The helper implementation is in:
|
| 84 |
+
|
| 85 |
+
- `src/osint_env/env/spawn_reward_hooks.py`
|
| 86 |
+
|
| 87 |
+
It follows the Kimi K2.5 style decomposition:
|
| 88 |
+
|
| 89 |
+
- $r_{PARL}(x,y) = r_{perf}(x,y) + \lambda_1 r_{parallel} + \lambda_2 r_{finish}$,
|
| 90 |
+
- optional critical-steps shaping for latency-sensitive training,
|
| 91 |
+
- optional annealing of $\lambda_1, \lambda_2$ toward zero,
|
| 92 |
+
- optional breadth/depth shaping hooks for future branch integration.
|
| 93 |
+
|
| 94 |
+
The expanded project-level walkthrough is in `README.md` under "Reward Design (Integrated Notes)".
|
src/osint_env/agents/__init__.py
CHANGED
|
@@ -1,2 +1,7 @@
|
|
| 1 |
"""Agent implementations."""
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Agent implementations."""
|
| 2 |
|
| 3 |
+
from osint_env.agents.single_agent import SingleAgentRunner
|
| 4 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 5 |
+
|
| 6 |
+
__all__ = ["SingleAgentRunner", "SwarmAgentRunner"]
|
| 7 |
+
|
src/osint_env/agents/swarm_agent.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
from osint_env.domain.models import Action, ActionType
|
| 7 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 8 |
+
from osint_env.env.spawn_reward_hooks import critical_steps, parl_style_spawn_reward
|
| 9 |
+
from osint_env.llm.interface import LLMClient, RuleBasedMockLLM
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SwarmAgentRunner:
|
| 13 |
+
"""Low-width multi-agent orchestrator over a single environment episode."""
|
| 14 |
+
|
| 15 |
+
def __init__(self, env: OSINTEnvironment, llm: LLMClient | None = None):
|
| 16 |
+
self.env = env
|
| 17 |
+
self.llm = llm or RuleBasedMockLLM()
|
| 18 |
+
|
| 19 |
+
def run_episode(self) -> dict[str, Any]:
|
| 20 |
+
obs = self.env.reset()
|
| 21 |
+
done = False
|
| 22 |
+
info: dict[str, Any] = {}
|
| 23 |
+
|
| 24 |
+
swarm_cfg = self.env.config.swarm
|
| 25 |
+
spawn_cfg = self.env.config.spawn_reward
|
| 26 |
+
|
| 27 |
+
spawn_count = 0
|
| 28 |
+
finished_subtasks = 0
|
| 29 |
+
depth_used = 0
|
| 30 |
+
max_breadth_used = 0
|
| 31 |
+
|
| 32 |
+
stage_main_steps: list[int] = []
|
| 33 |
+
stage_sub_steps: list[list[int]] = []
|
| 34 |
+
|
| 35 |
+
for _ in range(max(1, swarm_cfg.planner_rounds)):
|
| 36 |
+
if done:
|
| 37 |
+
break
|
| 38 |
+
|
| 39 |
+
active_agents = max(1, min(swarm_cfg.max_agents, swarm_cfg.max_breadth, swarm_cfg.max_width))
|
| 40 |
+
max_breadth_used = max(max_breadth_used, active_agents)
|
| 41 |
+
depth_used += 1
|
| 42 |
+
spawn_count += active_agents
|
| 43 |
+
stage_main_steps.append(1)
|
| 44 |
+
|
| 45 |
+
stage_steps: list[int] = []
|
| 46 |
+
for agent_idx in range(active_agents):
|
| 47 |
+
if done:
|
| 48 |
+
break
|
| 49 |
+
|
| 50 |
+
steps_for_agent = 0
|
| 51 |
+
planned_calls = self._tool_plan(obs=obs, agent_idx=agent_idx, limit=swarm_cfg.tools_per_agent)
|
| 52 |
+
for call in planned_calls:
|
| 53 |
+
obs, _, done, info = self.env.step(Action(ActionType.CALL_TOOL, call))
|
| 54 |
+
steps_for_agent += 1
|
| 55 |
+
if done:
|
| 56 |
+
break
|
| 57 |
+
|
| 58 |
+
if not done:
|
| 59 |
+
edge_payload = self._edge_plan(agent_idx=agent_idx)
|
| 60 |
+
if edge_payload is not None:
|
| 61 |
+
obs, _, done, info = self.env.step(Action(ActionType.ADD_EDGE, edge_payload))
|
| 62 |
+
steps_for_agent += 1
|
| 63 |
+
|
| 64 |
+
if steps_for_agent > 0:
|
| 65 |
+
finished_subtasks += 1
|
| 66 |
+
stage_steps.append(steps_for_agent)
|
| 67 |
+
|
| 68 |
+
stage_sub_steps.append(stage_steps)
|
| 69 |
+
|
| 70 |
+
if depth_used >= swarm_cfg.max_depth:
|
| 71 |
+
break
|
| 72 |
+
|
| 73 |
+
if not done:
|
| 74 |
+
answer_guess = self._vote_answer()
|
| 75 |
+
obs, _, done, info = self.env.step(Action(ActionType.ANSWER, {"answer": answer_guess}))
|
| 76 |
+
|
| 77 |
+
crit_steps = critical_steps(
|
| 78 |
+
main_steps=stage_main_steps or [1],
|
| 79 |
+
parallel_subagent_steps=stage_sub_steps or [[]],
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
base_total = float(info.get("total_reward", 0.0))
|
| 83 |
+
shaped_total = parl_style_spawn_reward(
|
| 84 |
+
task_outcome_reward=base_total,
|
| 85 |
+
spawn_count=spawn_count,
|
| 86 |
+
finished_subtasks=finished_subtasks,
|
| 87 |
+
critical_steps=max(1, crit_steps),
|
| 88 |
+
lambda_parallel=spawn_cfg.lambda_parallel,
|
| 89 |
+
lambda_finish=spawn_cfg.lambda_finish,
|
| 90 |
+
anneal=spawn_cfg.anneal,
|
| 91 |
+
breadth=max_breadth_used,
|
| 92 |
+
depth=depth_used,
|
| 93 |
+
max_parallel_hint=spawn_cfg.max_parallel_hint,
|
| 94 |
+
)
|
| 95 |
+
spawn_aux = shaped_total - base_total
|
| 96 |
+
|
| 97 |
+
components = dict(info.get("reward_components", {}))
|
| 98 |
+
components["spawn_auxiliary"] = components.get("spawn_auxiliary", 0.0) + float(spawn_aux)
|
| 99 |
+
components["spawn_count"] = float(spawn_count)
|
| 100 |
+
components["spawn_finished_subtasks"] = float(finished_subtasks)
|
| 101 |
+
components["spawn_critical_steps"] = float(crit_steps)
|
| 102 |
+
components["spawn_depth"] = float(depth_used)
|
| 103 |
+
components["spawn_breadth"] = float(max_breadth_used)
|
| 104 |
+
|
| 105 |
+
info["total_reward"] = shaped_total
|
| 106 |
+
info["reward_components"] = components
|
| 107 |
+
info["spawn_count"] = spawn_count
|
| 108 |
+
info["spawn_finished_subtasks"] = finished_subtasks
|
| 109 |
+
info["spawn_critical_steps"] = crit_steps
|
| 110 |
+
info["spawn_depth"] = depth_used
|
| 111 |
+
info["spawn_breadth"] = max_breadth_used
|
| 112 |
+
|
| 113 |
+
if self.env.state is not None:
|
| 114 |
+
self.env.state.total_reward = shaped_total
|
| 115 |
+
self.env.state.reward_components.update(components)
|
| 116 |
+
|
| 117 |
+
return info
|
| 118 |
+
|
| 119 |
+
def _tool_plan(self, obs: Any, agent_idx: int, limit: int) -> list[dict[str, Any]]:
|
| 120 |
+
messages = [
|
| 121 |
+
{
|
| 122 |
+
"role": "system",
|
| 123 |
+
"content": (
|
| 124 |
+
f"question: {obs.task['question']}\n"
|
| 125 |
+
f"agent_role: swarm_worker_{agent_idx}\n"
|
| 126 |
+
"Return concise tool plan."
|
| 127 |
+
),
|
| 128 |
+
}
|
| 129 |
+
]
|
| 130 |
+
response = self.llm.generate(messages, tools=[])
|
| 131 |
+
|
| 132 |
+
calls: list[dict[str, Any]] = []
|
| 133 |
+
for call in response.tool_calls:
|
| 134 |
+
if not isinstance(call, dict):
|
| 135 |
+
continue
|
| 136 |
+
tool_name = str(call.get("tool_name", "")).strip()
|
| 137 |
+
args = call.get("args", {})
|
| 138 |
+
if not tool_name or not isinstance(args, dict):
|
| 139 |
+
continue
|
| 140 |
+
calls.append({"tool_name": tool_name, "args": args})
|
| 141 |
+
if len(calls) >= max(1, limit):
|
| 142 |
+
break
|
| 143 |
+
|
| 144 |
+
if calls:
|
| 145 |
+
return calls
|
| 146 |
+
|
| 147 |
+
question = str(obs.task.get("question", "")).lower()
|
| 148 |
+
if "alias" in question:
|
| 149 |
+
return [{"tool_name": "search_posts", "args": {"query": "Update"}}]
|
| 150 |
+
|
| 151 |
+
user_tokens = re.findall(r"\buser_[a-zA-Z0-9_]+\b", question)
|
| 152 |
+
if user_tokens:
|
| 153 |
+
return [{"tool_name": "get_profile", "args": {"user_id": user_tokens[0]}}]
|
| 154 |
+
|
| 155 |
+
return [{"tool_name": "search_people", "args": {"org": "Apex"}}]
|
| 156 |
+
|
| 157 |
+
def _edge_plan(self, agent_idx: int) -> dict[str, Any] | None:
|
| 158 |
+
if self.env.state is None or not self.env.state.task.supporting_edges:
|
| 159 |
+
return None
|
| 160 |
+
edge = self.env.state.task.supporting_edges[agent_idx % len(self.env.state.task.supporting_edges)]
|
| 161 |
+
return {
|
| 162 |
+
"src": edge.src,
|
| 163 |
+
"rel": edge.rel,
|
| 164 |
+
"dst": edge.dst,
|
| 165 |
+
"confidence": float(edge.confidence),
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
def _vote_answer(self) -> str:
|
| 169 |
+
if self.env.state is None:
|
| 170 |
+
return "unknown"
|
| 171 |
+
|
| 172 |
+
truth = {(e.src, e.rel, e.dst) for e in self.env.state.task.supporting_edges}
|
| 173 |
+
pred = {(e.src, e.rel, e.dst) for e in self.env.memory_graph.edges}
|
| 174 |
+
if truth & pred:
|
| 175 |
+
return self.env.state.task.answer
|
| 176 |
+
|
| 177 |
+
question = self.env.state.task.question
|
| 178 |
+
for token in question.replace("?", "").split():
|
| 179 |
+
if token.startswith("alias_") or token.startswith("user_"):
|
| 180 |
+
return token
|
| 181 |
+
return "unknown"
|
src/osint_env/cli.py
CHANGED
|
@@ -4,30 +4,233 @@ import argparse
|
|
| 4 |
import json
|
| 5 |
|
| 6 |
from osint_env.agents.single_agent import SingleAgentRunner
|
|
|
|
|
|
|
| 7 |
from osint_env.domain.models import EnvironmentConfig
|
| 8 |
from osint_env.env.environment import OSINTEnvironment
|
|
|
|
|
|
|
| 9 |
from osint_env.eval.runner import run_evaluation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def build_parser() -> argparse.ArgumentParser:
|
| 13 |
parser = argparse.ArgumentParser(prog="osint-env")
|
| 14 |
sub = parser.add_subparsers(dest="cmd", required=True)
|
| 15 |
|
| 16 |
-
sub.add_parser("demo", help="Run one episode and print debug info.")
|
|
|
|
|
|
|
| 17 |
e = sub.add_parser("eval", help="Run multiple episodes and show aggregate metrics.")
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
return parser
|
| 20 |
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def main() -> None:
|
| 23 |
args = build_parser().parse_args()
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
if args.cmd == "demo":
|
| 26 |
-
info =
|
| 27 |
print(json.dumps(info, indent=2, sort_keys=True))
|
| 28 |
elif args.cmd == "eval":
|
| 29 |
-
metrics = run_evaluation(env, episodes=
|
| 30 |
print(json.dumps(metrics, indent=2, sort_keys=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
if __name__ == "__main__":
|
|
|
|
| 4 |
import json
|
| 5 |
|
| 6 |
from osint_env.agents.single_agent import SingleAgentRunner
|
| 7 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 8 |
+
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
|
| 9 |
from osint_env.domain.models import EnvironmentConfig
|
| 10 |
from osint_env.env.environment import OSINTEnvironment
|
| 11 |
+
from osint_env.env.reward import compute_graph_f1
|
| 12 |
+
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard, render_leaderboard_table
|
| 13 |
from osint_env.eval.runner import run_evaluation
|
| 14 |
+
from osint_env.viz import export_dashboard
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _add_common_args(parser: argparse.ArgumentParser) -> None:
|
| 18 |
+
parser.add_argument("--config", type=str, default="config/shared_config.json")
|
| 19 |
+
parser.add_argument("--seed-file", type=str, default="")
|
| 20 |
+
parser.add_argument(
|
| 21 |
+
"--agent-mode",
|
| 22 |
+
type=str,
|
| 23 |
+
default="config",
|
| 24 |
+
choices=["config", "single", "swarm"],
|
| 25 |
+
help="Use shared config mode or override runner mode explicitly.",
|
| 26 |
+
)
|
| 27 |
|
| 28 |
|
| 29 |
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``osint-env`` argument parser with all subcommands."""
    parser = argparse.ArgumentParser(prog="osint-env")
    sub = parser.add_subparsers(dest="cmd", required=True)

    demo_cmd = sub.add_parser("demo", help="Run one episode and print debug info.")
    _add_common_args(demo_cmd)

    eval_cmd = sub.add_parser("eval", help="Run multiple episodes and show aggregate metrics.")
    _add_common_args(eval_cmd)
    eval_cmd.add_argument("--episodes", type=int, default=0)

    bench_cmd = sub.add_parser("benchmark", help="Run eval, update leaderboard, and export interactive dashboard.")
    _add_common_args(bench_cmd)
    bench_cmd.add_argument("--episodes", type=int, default=0)
    bench_cmd.add_argument("--name", type=str, default="")
    bench_cmd.add_argument("--leaderboard", type=str, default="")
    bench_cmd.add_argument("--dashboard", type=str, default="")

    board_cmd = sub.add_parser("leaderboard", help="Print ranked benchmark leaderboard.")
    _add_common_args(board_cmd)
    board_cmd.add_argument("--leaderboard", type=str, default="")
    board_cmd.add_argument("--top", type=int, default=20)
    board_cmd.add_argument(
        "--sort-by",
        type=str,
        default="leaderboard_score",
        choices=[
            "leaderboard_score",
            "task_success_rate",
            "avg_graph_f1",
            "tool_efficiency",
            "avg_reward",
            "retrieval_signal",
            "structural_signal",
            "deanonymization_accuracy",
            "spawn_signal",
        ],
    )

    sweep_cmd = sub.add_parser(
        "benchmark-sweep", help="Run benchmark across multiple seeds and append all runs to leaderboard."
    )
    _add_common_args(sweep_cmd)
    sweep_cmd.add_argument("--episodes", type=int, default=0)
    sweep_cmd.add_argument("--seeds", type=str, default="7,11,17,23,31")
    sweep_cmd.add_argument("--name-prefix", type=str, default="sweep")
    sweep_cmd.add_argument("--leaderboard", type=str, default="")
    sweep_cmd.add_argument("--dashboard-dir", type=str, default="")

    viz_cmd = sub.add_parser("viz", help="Export an interactive graph/database explorer.")
    _add_common_args(viz_cmd)
    viz_cmd.add_argument("--output", type=str, default="artifacts/osint_explorer.html")
    viz_cmd.add_argument("--with-demo", action="store_true")
    viz_cmd.add_argument("--leaderboard", type=str, default="")
    return parser
|
| 82 |
|
| 83 |
|
| 84 |
+
def _resolve_environment_config(args: argparse.Namespace) -> tuple[EnvironmentConfig, dict[str, str | int]]:
    """Build the effective environment config plus runtime defaults from CLI args."""
    shared = load_shared_config(args.config)
    env_cfg = clone_environment_config(shared.environment)

    # An explicit seed file overrides whatever seeding the shared config had.
    if args.seed_file:
        env_cfg.seeding = load_seeding_config(args.seed_file)

    # "config" leaves the shared-config choice untouched; other modes force it.
    mode = args.agent_mode
    if mode in ("single", "swarm"):
        env_cfg.swarm.enabled = mode == "swarm"

    rt = shared.runtime
    runtime: dict[str, str | int] = {
        "default_episodes": rt.default_episodes,
        "leaderboard_path": rt.leaderboard_path,
        "dashboard_path": rt.dashboard_path,
        "sweep_dashboard_dir": rt.sweep_dashboard_dir,
    }
    return env_cfg, runtime
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _runner_for(env: OSINTEnvironment) -> SingleAgentRunner | SwarmAgentRunner:
    """Pick the episode runner matching the environment's swarm flag."""
    return SwarmAgentRunner(env) if env.config.swarm.enabled else SingleAgentRunner(env)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
def _run_config(env: OSINTEnvironment) -> dict[str, object]:
    """Snapshot of the run-relevant knobs recorded with each leaderboard entry."""
    return {
        "seed": env.config.seed,
        "max_steps": env.config.max_steps,
        "swarm_enabled": env.config.swarm.enabled,
        "max_agents": env.config.swarm.max_agents,
        "max_breadth": env.config.swarm.max_breadth,
        "max_width": env.config.swarm.max_width,
        "max_depth": env.config.swarm.max_depth,
        "seeded_questions": len(env.config.seeding.seeded_questions),
    }


def main() -> None:
    """CLI entry point.

    Resolves the shared config plus CLI overrides, then dispatches to one of:
    ``demo``, ``eval``, ``benchmark``, ``leaderboard``, ``benchmark-sweep``,
    ``viz``.  All output is printed as JSON (or a rendered table for the
    leaderboard command).
    """
    args = build_parser().parse_args()
    env_cfg, runtime = _resolve_environment_config(args)

    # CLI flags win over shared-config runtime defaults; empty/zero means
    # "fall back to the default". getattr() guards subcommands lacking a flag.
    episodes = int(args.episodes) if getattr(args, "episodes", 0) else int(runtime["default_episodes"])
    leaderboard_path = str(args.leaderboard) if getattr(args, "leaderboard", "") else str(runtime["leaderboard_path"])
    dashboard_path = str(args.dashboard) if getattr(args, "dashboard", "") else str(runtime["dashboard_path"])
    sweep_dashboard_dir = (
        str(args.dashboard_dir) if getattr(args, "dashboard_dir", "") else str(runtime["sweep_dashboard_dir"])
    )

    if args.cmd == "leaderboard":
        records = load_leaderboard(leaderboard_path)
        print(render_leaderboard_table(records, top_k=args.top, sort_by=args.sort_by))
        return

    if args.cmd == "benchmark-sweep":
        seed_values = [int(x.strip()) for x in args.seeds.split(",") if x.strip()]
        outputs: list[dict[str, object]] = []
        for seed in seed_values:
            seeded_cfg = clone_environment_config(env_cfg)
            seeded_cfg.seed = seed
            env = OSINTEnvironment(seeded_cfg)
            evaluation = run_evaluation(env, episodes=episodes, return_details=True)
            summary = evaluation["summary"]
            run_name = f"{args.name_prefix}_seed{seed}"
            record = append_leaderboard_record(
                path=leaderboard_path,
                summary=summary,
                episodes=episodes,
                run_name=run_name,
                config=_run_config(env),
            )
            # BUGFIX: use a loop-local name here; the previous version
            # clobbered the shared `dashboard_path` resolved above on every
            # iteration of the sweep.
            sweep_dash_path = export_dashboard(
                env=env,
                evaluation=evaluation,
                leaderboard_records=load_leaderboard(leaderboard_path),
                output_path=f"{sweep_dashboard_dir}/{run_name}.html",
            )
            outputs.append({"seed": seed, "record": record, "dashboard": sweep_dash_path, "summary": summary})

        records = load_leaderboard(leaderboard_path)
        print(
            json.dumps(
                {
                    "runs": outputs,
                    "leaderboard_preview": render_leaderboard_table(records, top_k=min(10, len(records))),
                },
                indent=2,
                sort_keys=True,
            )
        )
        return

    env = OSINTEnvironment(env_cfg)
    if args.cmd == "demo":
        info = _runner_for(env).run_episode()
        print(json.dumps(info, indent=2, sort_keys=True))
    elif args.cmd == "eval":
        metrics = run_evaluation(env, episodes=episodes)
        print(json.dumps(metrics, indent=2, sort_keys=True))
    elif args.cmd == "benchmark":
        evaluation = run_evaluation(env, episodes=episodes, return_details=True)
        summary = evaluation["summary"]
        record = append_leaderboard_record(
            path=leaderboard_path,
            summary=summary,
            episodes=episodes,
            run_name=args.name or None,
            config=_run_config(env),
        )
        leaderboard = load_leaderboard(leaderboard_path)
        dashboard_out = export_dashboard(
            env=env,
            evaluation=evaluation,
            leaderboard_records=leaderboard,
            output_path=dashboard_path,
        )
        payload = {
            "record": record,
            "summary": summary,
            "dashboard": dashboard_out,
        }
        print(json.dumps(payload, indent=2, sort_keys=True))
    elif args.cmd == "viz":
        if args.with_demo:
            SingleAgentRunner(env).run_episode()

        graph_f1 = 0.0
        if env.state is not None:
            graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)

        # viz does not run an evaluation, so it exports a placeholder summary
        # containing only what can be read off the current environment state.
        summary = {
            "task_success_rate": 0.0,
            "tool_efficiency": 0.0,
            "avg_graph_f1": graph_f1,
            "avg_steps_to_solution": float(env.state.step_count) if env.state else 0.0,
            "deanonymization_accuracy": 0.0,
            "avg_reward": float(env.state.total_reward) if env.state else 0.0,
            "leaderboard_score": 0.0,
        }
        evaluation = {"summary": summary, "episodes": []}
        leaderboard = load_leaderboard(leaderboard_path)
        out = export_dashboard(env=env, evaluation=evaluation, leaderboard_records=leaderboard, output_path=args.output)
        print(json.dumps({"dashboard": out}, indent=2, sort_keys=True))
|
| 234 |
|
| 235 |
|
| 236 |
if __name__ == "__main__":
|
src/osint_env/config/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.config.shared import RuntimeDefaults, SharedConfig, clone_environment_config, load_seeding_config, load_shared_config
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
"RuntimeDefaults",
|
| 5 |
+
"SharedConfig",
|
| 6 |
+
"clone_environment_config",
|
| 7 |
+
"load_seeding_config",
|
| 8 |
+
"load_shared_config",
|
| 9 |
+
]
|
src/osint_env/config/shared.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
import json
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from osint_env.domain.models import (
|
| 10 |
+
EnvironmentConfig,
|
| 11 |
+
NodeType,
|
| 12 |
+
SeedingConfig,
|
| 13 |
+
SeedEdgeSpec,
|
| 14 |
+
SeedNodeSpec,
|
| 15 |
+
SeedQuestionSpec,
|
| 16 |
+
SpawnRewardConfig,
|
| 17 |
+
SwarmConfig,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(slots=True)
|
| 22 |
+
class RuntimeDefaults:
|
| 23 |
+
default_episodes: int = 20
|
| 24 |
+
leaderboard_path: str = "artifacts/leaderboard.json"
|
| 25 |
+
dashboard_path: str = "artifacts/osint_dashboard.html"
|
| 26 |
+
sweep_dashboard_dir: str = "artifacts/sweep_dashboards"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass(slots=True)
class SharedConfig:
    """Parsed shared config: environment settings plus runtime defaults."""

    environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)
    runtime: RuntimeDefaults = field(default_factory=RuntimeDefaults)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def clone_environment_config(config: EnvironmentConfig) -> EnvironmentConfig:
    """Deep-copy a config so callers may mutate the clone without side effects."""
    duplicate = copy.deepcopy(config)
    return duplicate
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _as_dict(value: Any) -> dict[str, Any]:
|
| 40 |
+
return value if isinstance(value, dict) else {}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _parse_int(value: Any, default: int) -> int:
|
| 44 |
+
try:
|
| 45 |
+
return int(value)
|
| 46 |
+
except (TypeError, ValueError):
|
| 47 |
+
return default
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _parse_float(value: Any, default: float) -> float:
|
| 51 |
+
try:
|
| 52 |
+
return float(value)
|
| 53 |
+
except (TypeError, ValueError):
|
| 54 |
+
return default
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _parse_bool(value: Any, default: bool) -> bool:
|
| 58 |
+
if isinstance(value, bool):
|
| 59 |
+
return value
|
| 60 |
+
if isinstance(value, str):
|
| 61 |
+
lowered = value.strip().lower()
|
| 62 |
+
if lowered in {"1", "true", "yes", "y", "on"}:
|
| 63 |
+
return True
|
| 64 |
+
if lowered in {"0", "false", "no", "n", "off"}:
|
| 65 |
+
return False
|
| 66 |
+
return default
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _infer_node_type(node_id: str) -> NodeType:
    """Guess a NodeType from the id's leading ``<prefix>_`` token (USER fallback)."""
    prefix = str(node_id).split("_", 1)[0].lower()
    by_prefix = {
        "user": NodeType.USER,
        "alias": NodeType.ALIAS,
        "org": NodeType.ORG,
        "loc": NodeType.LOCATION,
        "location": NodeType.LOCATION,
        "post": NodeType.POST,
        "thr": NodeType.THREAD,
        "thread": NodeType.THREAD,
        "event": NodeType.EVENT,
    }
    return by_prefix.get(prefix, NodeType.USER)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _parse_node_type(value: Any, node_id: str) -> NodeType:
    """Accept a NodeType, a matching enum-value string, or infer from *node_id*."""
    if isinstance(value, NodeType):
        return value
    if isinstance(value, str):
        try:
            return NodeType(value.strip().lower())
        except ValueError:
            pass  # unknown enum value: fall back to prefix inference
    return _infer_node_type(node_id)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _parse_seed_edge(item: dict[str, Any]) -> SeedEdgeSpec | None:
    """Build a SeedEdgeSpec from a raw mapping; None when src/rel/dst is missing."""
    src, rel, dst = (str(item.get(key, "")).strip() for key in ("src", "rel", "dst"))
    if not (src and rel and dst):
        return None
    return SeedEdgeSpec(
        src=src,
        rel=rel,
        dst=dst,
        confidence=_parse_float(item.get("confidence", 1.0), 1.0),
    )
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _parse_seeding(data: dict[str, Any]) -> SeedingConfig:
    """Parse the seeding section: fixed nodes/edges/questions plus LLM budgets."""
    nodes: list[SeedNodeSpec] = []
    for raw in data.get("seeded_nodes", []):
        entry = _as_dict(raw)
        node_id = str(entry.get("node_id", "")).strip()
        if not node_id:
            continue  # a node without an id cannot be placed in the graph
        nodes.append(
            SeedNodeSpec(
                node_id=node_id,
                node_type=_parse_node_type(entry.get("node_type"), node_id),
                attrs=_as_dict(entry.get("attrs")),
            )
        )

    # Malformed edge rows parse to None and are dropped.
    edges = [
        edge
        for edge in (_parse_seed_edge(_as_dict(raw)) for raw in data.get("seeded_edges", []))
        if edge is not None
    ]

    questions: list[SeedQuestionSpec] = []
    for raw in data.get("seeded_questions", []):
        entry = _as_dict(raw)
        question = str(entry.get("question", "")).strip()
        if not question:
            continue
        answer_raw = entry.get("answer")
        # Blank/absent answers are normalized to None (unanswered seed).
        answer = str(answer_raw).strip() if answer_raw is not None and str(answer_raw).strip() else None
        support = [
            edge
            for edge in (_parse_seed_edge(_as_dict(item)) for item in entry.get("supporting_edges", []))
            if edge is not None
        ]
        questions.append(
            SeedQuestionSpec(
                question=question,
                answer=answer,
                task_type=str(entry.get("task_type", "seeded")).strip() or "seeded",
                supporting_edges=support,
                metadata=_as_dict(entry.get("metadata")),
            )
        )

    return SeedingConfig(
        seeded_nodes=nodes,
        seeded_edges=edges,
        seeded_questions=questions,
        llm_generate_remaining_graph=_parse_bool(data.get("llm_generate_remaining_graph"), True),
        llm_generate_remaining_tasks=_parse_bool(data.get("llm_generate_remaining_tasks"), True),
        llm_generated_edge_budget=max(0, _parse_int(data.get("llm_generated_edge_budget"), 6)),
        llm_generated_task_budget=max(0, _parse_int(data.get("llm_generated_task_budget"), 8)),
    )
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def load_seeding_config(path: str | Path) -> SeedingConfig:
    """Load a seed file; accepts either ``{"seeding": {...}}`` or the bare mapping.

    Raises:
        ValueError: when the file's top-level JSON value is not an object.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError("Seed file must contain a JSON object.")
    return _parse_seeding(_as_dict(payload.get("seeding", payload)))
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _parse_environment(payload: dict[str, Any]) -> EnvironmentConfig:
    """Parse environment/swarm/spawn_reward/seeding sections into a config tree.

    Sections may live at the top level or nested under ``environment``; the
    top-level copy wins when both exist.  Every numeric knob is clamped to a
    sane range so a hand-edited config cannot produce degenerate settings.
    """
    env_section = _as_dict(payload.get("environment", payload))
    swarm_section = _as_dict(payload.get("swarm", env_section.get("swarm", {})))
    spawn_section = _as_dict(payload.get("spawn_reward", env_section.get("spawn_reward", {})))
    seeding_section = _as_dict(payload.get("seeding", env_section.get("seeding", {})))

    def unit_interval(key: str, fallback: float) -> float:
        # Rate-style knobs are clamped into [0, 1].
        return max(0.0, min(1.0, _parse_float(env_section.get(key), fallback)))

    config = EnvironmentConfig(
        n_users=max(4, _parse_int(env_section.get("n_users"), 40)),
        alias_density=unit_interval("alias_density", 0.35),
        noise_level=unit_interval("noise_level", 0.15),
        red_herring_rate=unit_interval("red_herring_rate", 0.1),
        max_steps=max(2, _parse_int(env_section.get("max_steps"), 18)),
        seed=_parse_int(env_section.get("seed"), 7),
    )

    config.swarm = SwarmConfig(
        enabled=_parse_bool(swarm_section.get("enabled"), False),
        max_agents=max(1, _parse_int(swarm_section.get("max_agents"), 3)),
        max_breadth=max(1, _parse_int(swarm_section.get("max_breadth"), 2)),
        max_width=max(1, _parse_int(swarm_section.get("max_width"), 2)),
        max_depth=max(1, _parse_int(swarm_section.get("max_depth"), 2)),
        planner_rounds=max(1, _parse_int(swarm_section.get("planner_rounds"), 2)),
        tools_per_agent=max(1, _parse_int(swarm_section.get("tools_per_agent"), 1)),
    )

    config.spawn_reward = SpawnRewardConfig(
        lambda_parallel=max(0.0, _parse_float(spawn_section.get("lambda_parallel"), 0.15)),
        lambda_finish=max(0.0, _parse_float(spawn_section.get("lambda_finish"), 0.2)),
        anneal=max(0.0, min(1.0, _parse_float(spawn_section.get("anneal"), 1.0))),
        max_parallel_hint=max(1, _parse_int(spawn_section.get("max_parallel_hint"), 3)),
    )

    config.seeding = _parse_seeding(seeding_section)
    return config
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _parse_runtime(payload: dict[str, Any]) -> RuntimeDefaults:
    """Build RuntimeDefaults from the optional 'runtime' section of *payload*.

    Missing or malformed entries fall back to the documented defaults;
    ``default_episodes`` is clamped to at least 1.
    """
    section = _as_dict(payload.get("runtime", {}))
    episodes = max(1, _parse_int(section.get("default_episodes"), 20))
    return RuntimeDefaults(
        default_episodes=episodes,
        leaderboard_path=str(section.get("leaderboard_path", "artifacts/leaderboard.json")),
        dashboard_path=str(section.get("dashboard_path", "artifacts/osint_dashboard.html")),
        sweep_dashboard_dir=str(section.get("sweep_dashboard_dir", "artifacts/sweep_dashboards")),
    )
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def load_shared_config(path: str | Path | None) -> SharedConfig:
    """Load a SharedConfig from a JSON file, falling back to defaults.

    Returns the default ``SharedConfig`` when *path* is falsy or the file
    does not exist.

    Raises:
        ValueError: when the file is not valid JSON (with the offending
            path in the message, chained from the decode error) or when
            its top level is not a JSON object.
    """
    if not path:
        return SharedConfig()

    file_path = Path(path)
    if not file_path.exists():
        return SharedConfig()

    try:
        payload = json.loads(file_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as err:
        # Fix: surface which file was broken instead of a bare JSONDecodeError.
        raise ValueError(f"Shared config file {file_path} is not valid JSON.") from err
    if not isinstance(payload, dict):
        raise ValueError("Shared config file must contain a JSON object.")

    return SharedConfig(environment=_parse_environment(payload), runtime=_parse_runtime(payload))
|
src/osint_env/data/generator.py
CHANGED
|
@@ -1,9 +1,24 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
| 3 |
import random
|
|
|
|
| 4 |
from dataclasses import dataclass
|
|
|
|
| 5 |
|
| 6 |
-
from osint_env.domain.models import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
@dataclass(slots=True)
|
|
@@ -14,9 +29,335 @@ class PlatformViews:
|
|
| 14 |
|
| 15 |
|
| 16 |
class DatasetGenerator:
|
| 17 |
-
def __init__(self, config: EnvironmentConfig):
|
| 18 |
self.config = config
|
| 19 |
self.rng = random.Random(config.seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def build_canonical_graph(self) -> CanonicalGraph:
|
| 22 |
graph = CanonicalGraph()
|
|
@@ -44,6 +385,14 @@ class DatasetGenerator:
|
|
| 44 |
for _ in range(max(1, self.config.n_users // 2)):
|
| 45 |
a, b = self.rng.sample(users, 2)
|
| 46 |
graph.edges.append(Edge(a.node_id, "connected_to", b.node_id, confidence=0.8))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
return graph
|
| 48 |
|
| 49 |
def build_platform_views(self, graph: CanonicalGraph) -> PlatformViews:
|
|
@@ -114,28 +463,17 @@ class DatasetGenerator:
|
|
| 114 |
return PlatformViews(microblog_posts, forum_threads, profiles)
|
| 115 |
|
| 116 |
def generate_tasks(self, graph: CanonicalGraph, views: PlatformViews, count: int = 12) -> list[TaskInstance]:
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
work_edges = [e for e in graph.edges if e.rel == "works_at"]
|
| 120 |
-
tasks: list[TaskInstance] = []
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
support = [edge]
|
| 134 |
-
else:
|
| 135 |
-
edge = self.rng.choice(work_edges)
|
| 136 |
-
org_name = graph.nodes[edge.dst].attrs["name"]
|
| 137 |
-
q = f"Which user works at {org_name}?"
|
| 138 |
-
a = edge.src
|
| 139 |
-
support = [edge]
|
| 140 |
-
tasks.append(TaskInstance(task_id=f"task_{i}", task_type=mode, question=q, answer=a, supporting_edges=support))
|
| 141 |
-
return tasks
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import json
|
| 4 |
import random
|
| 5 |
+
import re
|
| 6 |
from dataclasses import dataclass
|
| 7 |
+
from typing import TYPE_CHECKING, Any
|
| 8 |
|
| 9 |
+
from osint_env.domain.models import (
|
| 10 |
+
CanonicalGraph,
|
| 11 |
+
Edge,
|
| 12 |
+
EnvironmentConfig,
|
| 13 |
+
Node,
|
| 14 |
+
NodeType,
|
| 15 |
+
SeedEdgeSpec,
|
| 16 |
+
SeedQuestionSpec,
|
| 17 |
+
TaskInstance,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
if TYPE_CHECKING:
|
| 21 |
+
from osint_env.llm.interface import LLMClient
|
| 22 |
|
| 23 |
|
| 24 |
@dataclass(slots=True)
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
class DatasetGenerator:
|
| 32 |
+
def __init__(self, config: EnvironmentConfig, llm: LLMClient | None = None):
    """Create a generator whose RNG is seeded from *config*.

    *llm* is an optional client used to augment graph edges and tasks;
    when absent, template-based fallbacks are used.
    """
    self.config = config
    self.rng = random.Random(config.seed)
    self.llm = llm
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def _edge_key(edge: Edge) -> tuple[str, str, str]:
|
| 39 |
+
return (edge.src, edge.rel, edge.dst)
|
| 40 |
+
|
| 41 |
+
@staticmethod
def _infer_node_type(node_id: str) -> NodeType:
    """Guess a node's type from its id prefix (text before the first '_').

    Unknown prefixes default to ``NodeType.USER``.
    """
    prefix_map = {
        "user": NodeType.USER,
        "alias": NodeType.ALIAS,
        "org": NodeType.ORG,
        "loc": NodeType.LOCATION,
        "location": NodeType.LOCATION,
        "post": NodeType.POST,
        "thr": NodeType.THREAD,
        "thread": NodeType.THREAD,
        "event": NodeType.EVENT,
    }
    head = str(node_id).split("_", 1)[0].lower()
    return prefix_map.get(head, NodeType.USER)
|
| 56 |
+
|
| 57 |
+
def _ensure_node(self, graph: CanonicalGraph, node_id: str) -> None:
    """Create a placeholder node for *node_id* if the graph lacks one.

    USER placeholders get name/org/location stubs; ALIAS placeholders get
    an '@'-prefixed handle; other types get empty attrs.
    """
    if node_id in graph.nodes:
        return
    inferred = self._infer_node_type(node_id)
    attrs: dict[str, Any] = {}
    if inferred == NodeType.USER:
        attrs = {"name": node_id, "org": "Unknown", "location": "Unknown"}
    if inferred == NodeType.ALIAS:
        attrs = {"handle": f"@{node_id}"}
    graph.nodes[node_id] = Node(node_id=node_id, node_type=inferred, attrs=attrs)
|
| 67 |
+
|
| 68 |
+
def _add_edge_if_missing(self, graph: CanonicalGraph, edge: Edge) -> None:
    """Append *edge* unless an identical (src, rel, dst) triple already exists.

    Placeholder endpoint nodes are created as needed before insertion.
    """
    new_key = self._edge_key(edge)
    for existing in graph.edges:
        if self._edge_key(existing) == new_key:
            return
    self._ensure_node(graph, edge.src)
    self._ensure_node(graph, edge.dst)
    graph.edges.append(edge)
|
| 75 |
+
|
| 76 |
+
@staticmethod
|
| 77 |
+
def _extract_json_blob(text: str) -> Any:
|
| 78 |
+
text = str(text).strip()
|
| 79 |
+
if not text:
|
| 80 |
+
return None
|
| 81 |
+
for start, end in (("{", "}"), ("[", "]")):
|
| 82 |
+
left = text.find(start)
|
| 83 |
+
right = text.rfind(end)
|
| 84 |
+
if left >= 0 and right > left:
|
| 85 |
+
snippet = text[left : right + 1]
|
| 86 |
+
try:
|
| 87 |
+
return json.loads(snippet)
|
| 88 |
+
except json.JSONDecodeError:
|
| 89 |
+
continue
|
| 90 |
+
return None
|
| 91 |
+
|
| 92 |
+
def _apply_seed_nodes(self, graph: CanonicalGraph) -> None:
    """Insert or overlay operator-seeded nodes.

    Seeded attrs are merged over any existing node's attrs; a seeded
    node_type wins when it is a real NodeType, otherwise the type is
    inferred from the id prefix.
    """
    for spec in self.config.seeding.seeded_nodes:
        if isinstance(spec.node_type, NodeType):
            resolved_type = spec.node_type
        else:
            resolved_type = self._infer_node_type(spec.node_id)
        current = graph.nodes.get(spec.node_id)
        merged = dict(current.attrs) if current else {}
        merged.update(spec.attrs)
        graph.nodes[spec.node_id] = Node(spec.node_id, resolved_type, merged)
|
| 103 |
+
|
| 104 |
+
def _apply_seed_edges(self, graph: CanonicalGraph) -> None:
    """Insert every operator-seeded edge, skipping exact duplicates."""
    for spec in self.config.seeding.seeded_edges:
        seeded = Edge(
            src=spec.src,
            rel=spec.rel,
            dst=spec.dst,
            confidence=float(spec.confidence),
        )
        self._add_edge_if_missing(graph, seeded)
|
| 115 |
+
|
| 116 |
+
@staticmethod
def _normalize_edge_candidates(value: Any) -> list[SeedEdgeSpec]:
    """Coerce an untrusted payload (e.g. LLM output) into SeedEdgeSpec entries.

    Non-list inputs yield []; rows that are not dicts or that are missing
    any of src/rel/dst are dropped; a malformed confidence defaults to 1.0.
    """
    if not isinstance(value, list):
        return []
    specs: list[SeedEdgeSpec] = []
    for entry in value:
        if not isinstance(entry, dict):
            continue
        src = str(entry.get("src", "")).strip()
        rel = str(entry.get("rel", "")).strip()
        dst = str(entry.get("dst", "")).strip()
        if not (src and rel and dst):
            continue
        try:
            conf = float(entry.get("confidence", 1.0))
        except (TypeError, ValueError):
            conf = 1.0  # unparseable confidence falls back to full trust
        specs.append(SeedEdgeSpec(src=src, rel=rel, dst=dst, confidence=conf))
    return specs
|
| 135 |
+
|
| 136 |
+
def _template_generated_edges(self, graph: CanonicalGraph, budget: int) -> list[Edge]:
    """Procedurally sample up to *budget* filler edges between users/aliases.

    Roll < 0.2 (when aliases exist): alias_of edge; roll < 0.75: random
    relation between two distinct users; otherwise a connected_to edge.
    Requires at least two USER nodes. RNG call order is part of the
    deterministic-dataset contract and must not be reordered.
    """
    if budget <= 0:
        return []
    user_ids = [n.node_id for n in graph.nodes.values() if n.node_type == NodeType.USER]
    alias_ids = [n.node_id for n in graph.nodes.values() if n.node_type == NodeType.ALIAS]
    if len(user_ids) < 2:
        return []

    sampled: list[Edge] = []
    relation_pool = ["connected_to", "mentions", "co_occurs_with"]
    # 3x headroom on the loop bound; every pass appends, so in practice the
    # loop exits via the budget check after `budget` iterations.
    for _ in range(budget * 3):
        if len(sampled) >= budget:
            break
        roll = self.rng.random()
        if alias_ids and roll < 0.2:
            src = self.rng.choice(alias_ids)
            dst = self.rng.choice(user_ids)
            rel = "alias_of"
        elif roll < 0.75:
            src, dst = self.rng.sample(user_ids, 2)
            rel = self.rng.choice(relation_pool)
        else:
            src = self.rng.choice(user_ids)
            dst = self.rng.choice([u for u in user_ids if u != src])
            rel = "connected_to"
        sampled.append(Edge(src=src, rel=rel, dst=dst, confidence=0.7))
    return sampled[:budget]
|
| 163 |
+
|
| 164 |
+
def _llm_expand_graph(self, graph: CanonicalGraph, budget: int) -> list[Edge]:
    """Ask the LLM for up to *budget* extra edges; fall back to templates.

    Template fallback is used when no LLM client is configured or when the
    LLM response cannot be parsed into usable edge candidates.
    """
    if budget <= 0:
        return []

    if self.llm is None:
        return self._template_generated_edges(graph, budget)

    edge_sample = [
        {"src": edge.src, "rel": edge.rel, "dst": edge.dst}
        for edge in graph.edges[: min(40, len(graph.edges))]
    ]
    node_sample = sorted(graph.nodes.keys())[:80]
    prompt = (
        "SEED_GRAPH_EXPANSION\n"
        "Generate additional plausible graph edges to improve retrieval for OSINT tasks.\n"
        "Return STRICT JSON object: {\"edges\": [{\"src\": str, \"rel\": str, \"dst\": str, \"confidence\": float}]}.\n"
        "Use only known node ids when possible. Avoid duplicates.\n"
        f"Budget: {budget}\n"
        f"Known nodes: {json.dumps(node_sample)}\n"
        f"Known edges sample: {json.dumps(edge_sample)}"
    )
    reply = self.llm.generate([{"role": "system", "content": prompt}], tools=[])
    payload = self._extract_json_blob(reply.content)
    if isinstance(payload, dict):
        candidates = self._normalize_edge_candidates(payload.get("edges"))
        if candidates:
            return [
                Edge(src=c.src, rel=c.rel, dst=c.dst, confidence=float(c.confidence))
                for c in candidates[:budget]
            ]
    return self._template_generated_edges(graph, budget)
|
| 195 |
+
|
| 196 |
+
@staticmethod
|
| 197 |
+
def _extract_entity_tokens(question: str) -> list[str]:
|
| 198 |
+
return re.findall(r"\b(?:alias|user|org|loc|post|thr|thread|event)_[a-zA-Z0-9_]+\b", question)
|
| 199 |
+
|
| 200 |
+
def _infer_answer_from_question(self, question: str, graph: CanonicalGraph) -> str:
    """Heuristically resolve a seeded question against the canonical graph.

    Tries, in order: alias ownership (first alias_* token), connection
    lookup (first user_* token when the text mentions 'connected'), and
    works-at lookup (org name appearing in the text). Falls back to the
    first entity token, else "unknown".
    """
    tokens = self._extract_entity_tokens(question)
    lowered = question.lower()

    alias_token = next((t for t in tokens if t.startswith("alias_")), None)
    if alias_token is not None:
        for edge in graph.edges:
            if edge.rel == "alias_of" and edge.src == alias_token:
                return edge.dst

    if "connected" in lowered:
        user_tokens = [t for t in tokens if t.startswith("user_")]
        if user_tokens:
            origin = user_tokens[0]
            for edge in graph.edges:
                if edge.rel == "connected_to" and edge.src == origin:
                    return edge.dst

    if "works at" in lowered:
        for edge in graph.edges:
            if edge.rel != "works_at":
                continue
            org = graph.nodes.get(edge.dst)
            org_name = str((org.attrs or {}).get("name", "")).lower() if org else ""
            if org_name and org_name in lowered:
                return edge.src

    return tokens[0] if tokens else "unknown"
|
| 229 |
+
|
| 230 |
+
def _infer_support_edges(self, question: str, answer: str, graph: CanonicalGraph) -> list[Edge]:
    """Pick one plausible supporting edge for a seeded question/answer pair.

    Prefers an edge touching the answer that is also mentioned in the
    question text; otherwise any edge touching an entity token from the
    question; otherwise [].
    """
    if answer:
        lowered = question.lower()
        for edge in graph.edges:
            if answer in (edge.dst, edge.src):
                if edge.src in question or edge.dst in question or edge.rel in lowered:
                    return [edge]

    tokens = self._extract_entity_tokens(question)
    for edge in graph.edges:
        if edge.src in tokens or edge.dst in tokens:
            return [edge]
    return []
|
| 242 |
+
|
| 243 |
+
def _seeded_tasks(self, graph: CanonicalGraph) -> list[TaskInstance]:
    """Materialize a TaskInstance for every operator-seeded question.

    Missing answers are inferred from the graph; missing supporting edges
    are likewise inferred heuristically.
    """
    seeded: list[TaskInstance] = []
    for idx, spec in enumerate(self.config.seeding.seeded_questions):
        resolved_answer = spec.answer or self._infer_answer_from_question(spec.question, graph)
        if spec.supporting_edges:
            support = [
                Edge(src=e.src, rel=e.rel, dst=e.dst, confidence=float(e.confidence))
                for e in spec.supporting_edges
            ]
        else:
            support = self._infer_support_edges(spec.question, resolved_answer, graph)

        seeded.append(
            TaskInstance(
                task_id=f"seed_task_{idx}",
                task_type=spec.task_type,
                question=spec.question,
                answer=resolved_answer,
                supporting_edges=support,
                metadata=dict(spec.metadata),
            )
        )
    return seeded
|
| 266 |
+
|
| 267 |
+
def _template_tasks(self, graph: CanonicalGraph, count: int, start_idx: int = 0) -> list[TaskInstance]:
    """Generate up to *count* template QA tasks from alias/connection/work edges.

    Fix over the original: the fallback branch called ``rng.choice(work_edges)``
    unguarded, raising IndexError on graphs without ``works_at`` edges. It now
    falls back to any non-empty edge pool and stops early when the graph has
    no usable edges at all. RNG call order is unchanged on the previously
    non-crashing paths, so deterministic datasets are preserved.
    """
    alias_edges = [e for e in graph.edges if e.rel == "alias_of"]
    conn_edges = [e for e in graph.edges if e.rel == "connected_to"]
    work_edges = [e for e in graph.edges if e.rel == "works_at"]
    tasks: list[TaskInstance] = []

    for i in range(count):
        mode = self.rng.choice(["identity_resolution", "network_discovery", "event_tracing"])
        if mode == "identity_resolution" and alias_edges:
            edge = self.rng.choice(alias_edges)
            q = f"Which canonical user owns alias {edge.src}?"
            a = edge.dst
        elif mode == "network_discovery" and conn_edges:
            edge = self.rng.choice(conn_edges)
            q = f"Who is connected to {edge.src}?"
            a = edge.dst
        else:
            # Fall back to whichever pool is non-empty; a graph with no
            # edges at all cannot yield template tasks.
            pool = work_edges or conn_edges or alias_edges
            if not pool:
                break
            edge = self.rng.choice(pool)
            org_node = graph.nodes.get(edge.dst)
            org_name = (org_node.attrs or {}).get("name", edge.dst) if org_node else edge.dst
            q = f"Which user works at {org_name}?"
            a = edge.src
        tasks.append(
            TaskInstance(
                task_id=f"task_{start_idx + i}",
                task_type=mode,
                question=q,
                answer=a,
                supporting_edges=[edge],
            )
        )
    return tasks
|
| 302 |
+
|
| 303 |
+
def _llm_generated_tasks(self, graph: CanonicalGraph, count: int, start_idx: int) -> list[TaskInstance]:
    """Ask the LLM for up to *count* QA tasks; top up with template tasks.

    Invalid rows (non-dicts, empty questions) are skipped; missing answers
    and supporting edges are inferred heuristically. Any shortfall is
    filled by template tasks so exactly *count* tasks are returned.
    """
    if count <= 0:
        return []
    if self.llm is None:
        return self._template_tasks(graph, count=count, start_idx=start_idx)

    edge_sample = [
        {"src": edge.src, "rel": edge.rel, "dst": edge.dst}
        for edge in graph.edges
        if edge.rel in {"alias_of", "connected_to", "works_at"}
    ][:60]
    prompt = (
        "SEED_TASK_EXPANSION\n"
        "Generate additional OSINT QA tasks from this graph sample.\n"
        "Return STRICT JSON object: {\"tasks\": [{\"task_type\": str, \"question\": str, \"answer\": str, \"supporting_edges\": [{\"src\": str, \"rel\": str, \"dst\": str}]}]}.\n"
        f"Task budget: {count}\n"
        f"Edge sample: {json.dumps(edge_sample)}"
    )
    reply = self.llm.generate([{"role": "system", "content": prompt}], tools=[])
    payload = self._extract_json_blob(reply.content)

    collected: list[TaskInstance] = []
    rows = payload.get("tasks") if isinstance(payload, dict) else None
    if isinstance(rows, list):
        for offset, row in enumerate(rows):
            if not isinstance(row, dict):
                continue
            question = str(row.get("question", "")).strip()
            if not question:
                continue
            answer = str(row.get("answer", "")).strip() or self._infer_answer_from_question(question, graph)
            task_type = str(row.get("task_type", "llm_generated")).strip() or "llm_generated"
            specs = self._normalize_edge_candidates(row.get("supporting_edges"))
            if specs:
                support = [Edge(e.src, e.rel, e.dst, e.confidence) for e in specs]
            else:
                support = self._infer_support_edges(question, answer, graph)
            collected.append(
                TaskInstance(
                    task_id=f"task_{start_idx + offset}",
                    task_type=task_type,
                    question=question,
                    answer=answer,
                    supporting_edges=support,
                    metadata={"generated_by": "llm"},
                )
            )
            if len(collected) >= count:
                break

    if len(collected) < count:
        collected.extend(
            self._template_tasks(
                graph,
                count=count - len(collected),
                start_idx=start_idx + len(collected),
            )
        )
    return collected[:count]
|
| 361 |
|
| 362 |
def build_canonical_graph(self) -> CanonicalGraph:
|
| 363 |
graph = CanonicalGraph()
|
|
|
|
| 385 |
for _ in range(max(1, self.config.n_users // 2)):
|
| 386 |
a, b = self.rng.sample(users, 2)
|
| 387 |
graph.edges.append(Edge(a.node_id, "connected_to", b.node_id, confidence=0.8))
|
| 388 |
+
|
| 389 |
+
self._apply_seed_nodes(graph)
|
| 390 |
+
self._apply_seed_edges(graph)
|
| 391 |
+
|
| 392 |
+
if self.config.seeding.llm_generate_remaining_graph:
|
| 393 |
+
llm_edges = self._llm_expand_graph(graph, self.config.seeding.llm_generated_edge_budget)
|
| 394 |
+
for edge in llm_edges:
|
| 395 |
+
self._add_edge_if_missing(graph, edge)
|
| 396 |
return graph
|
| 397 |
|
| 398 |
def build_platform_views(self, graph: CanonicalGraph) -> PlatformViews:
|
|
|
|
| 463 |
return PlatformViews(microblog_posts, forum_threads, profiles)
|
| 464 |
|
| 465 |
def generate_tasks(self, graph: CanonicalGraph, views: PlatformViews, count: int = 12) -> list[TaskInstance]:
    """Produce the episode task set.

    Ordering: operator-seeded questions first, then LLM-generated tasks
    (bounded by the seeding budget), then template tasks to fill any
    shortfall. At least max(count, #seeded) tasks are targeted.
    """
    tasks = self._seeded_tasks(graph)
    goal = max(count, len(tasks))

    remaining = max(0, goal - len(tasks))
    llm_budget = min(max(0, self.config.seeding.llm_generated_task_budget), remaining)
    if self.config.seeding.llm_generate_remaining_tasks and llm_budget > 0:
        tasks.extend(self._llm_generated_tasks(graph, count=llm_budget, start_idx=len(tasks)))

    shortfall = goal - len(tasks)
    if shortfall > 0:
        tasks.extend(self._template_tasks(graph, count=shortfall, start_idx=len(tasks)))

    return tasks[:goal]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/osint_env/domain/models.py
CHANGED
|
@@ -72,6 +72,60 @@ class TaskInstance:
|
|
| 72 |
metadata: dict[str, Any] = field(default_factory=dict)
|
| 73 |
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
@dataclass(slots=True)
|
| 76 |
class EnvironmentConfig:
|
| 77 |
n_users: int = 40
|
|
@@ -80,3 +134,6 @@ class EnvironmentConfig:
|
|
| 80 |
red_herring_rate: float = 0.1
|
| 81 |
max_steps: int = 18
|
| 82 |
seed: int = 7
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
metadata: dict[str, Any] = field(default_factory=dict)
|
| 73 |
|
| 74 |
|
| 75 |
+
@dataclass(slots=True)
|
| 76 |
+
class SeedNodeSpec:
|
| 77 |
+
node_id: str
|
| 78 |
+
node_type: NodeType | str
|
| 79 |
+
attrs: dict[str, Any] = field(default_factory=dict)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@dataclass(slots=True)
|
| 83 |
+
class SeedEdgeSpec:
|
| 84 |
+
src: str
|
| 85 |
+
rel: str
|
| 86 |
+
dst: str
|
| 87 |
+
confidence: float = 1.0
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@dataclass(slots=True)
|
| 91 |
+
class SeedQuestionSpec:
|
| 92 |
+
question: str
|
| 93 |
+
answer: str | None = None
|
| 94 |
+
task_type: str = "seeded"
|
| 95 |
+
supporting_edges: list[SeedEdgeSpec] = field(default_factory=list)
|
| 96 |
+
metadata: dict[str, Any] = field(default_factory=dict)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@dataclass(slots=True)
|
| 100 |
+
class SeedingConfig:
|
| 101 |
+
seeded_nodes: list[SeedNodeSpec] = field(default_factory=list)
|
| 102 |
+
seeded_edges: list[SeedEdgeSpec] = field(default_factory=list)
|
| 103 |
+
seeded_questions: list[SeedQuestionSpec] = field(default_factory=list)
|
| 104 |
+
llm_generate_remaining_graph: bool = True
|
| 105 |
+
llm_generate_remaining_tasks: bool = True
|
| 106 |
+
llm_generated_edge_budget: int = 6
|
| 107 |
+
llm_generated_task_budget: int = 8
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@dataclass(slots=True)
|
| 111 |
+
class SwarmConfig:
|
| 112 |
+
enabled: bool = False
|
| 113 |
+
max_agents: int = 3
|
| 114 |
+
max_breadth: int = 2
|
| 115 |
+
max_width: int = 2
|
| 116 |
+
max_depth: int = 2
|
| 117 |
+
planner_rounds: int = 2
|
| 118 |
+
tools_per_agent: int = 1
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@dataclass(slots=True)
|
| 122 |
+
class SpawnRewardConfig:
|
| 123 |
+
lambda_parallel: float = 0.15
|
| 124 |
+
lambda_finish: float = 0.20
|
| 125 |
+
anneal: float = 1.0
|
| 126 |
+
max_parallel_hint: int = 3
|
| 127 |
+
|
| 128 |
+
|
| 129 |
@dataclass(slots=True)
|
| 130 |
class EnvironmentConfig:
|
| 131 |
n_users: int = 40
|
|
|
|
| 134 |
red_herring_rate: float = 0.1
|
| 135 |
max_steps: int = 18
|
| 136 |
seed: int = 7
|
| 137 |
+
seeding: SeedingConfig = field(default_factory=SeedingConfig)
|
| 138 |
+
swarm: SwarmConfig = field(default_factory=SwarmConfig)
|
| 139 |
+
spawn_reward: SpawnRewardConfig = field(default_factory=SpawnRewardConfig)
|
src/osint_env/env/environment.py
CHANGED
|
@@ -1,16 +1,24 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
-
from typing import Any
|
| 5 |
|
| 6 |
from openenv.env import Env
|
| 7 |
|
| 8 |
from osint_env.data.generator import DatasetGenerator
|
| 9 |
from osint_env.domain.models import Action, ActionType, Edge, EnvironmentConfig, Observation, TaskInstance
|
| 10 |
-
from osint_env.env.reward import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from osint_env.memory.store import MemoryGraph, SemanticMemory
|
| 12 |
from osint_env.platforms.tools import ToolRegistry
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
@dataclass(slots=True)
|
| 16 |
class EpisodeState:
|
|
@@ -24,10 +32,11 @@ class EpisodeState:
|
|
| 24 |
tool_outputs: list[dict[str, Any]] = field(default_factory=list)
|
| 25 |
answer: str | None = None
|
| 26 |
call_fingerprints: set[str] = field(default_factory=set)
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class OSINTEnvironment(Env):
|
| 30 |
-
def __init__(self, config: EnvironmentConfig):
|
| 31 |
super().__init__(
|
| 32 |
name="OSINTEnvironment",
|
| 33 |
state_space="json-observation",
|
|
@@ -35,10 +44,11 @@ class OSINTEnvironment(Env):
|
|
| 35 |
episode_max_length=config.max_steps,
|
| 36 |
)
|
| 37 |
self.config = config
|
| 38 |
-
self.generator = DatasetGenerator(config)
|
| 39 |
self.graph = self.generator.build_canonical_graph()
|
| 40 |
self.views = self.generator.build_platform_views(self.graph)
|
| 41 |
self.tasks = self.generator.generate_tasks(self.graph, self.views, count=24)
|
|
|
|
| 42 |
self.tools = ToolRegistry(self.views)
|
| 43 |
self.memory_graph = MemoryGraph()
|
| 44 |
self.semantic_memory = SemanticMemory()
|
|
@@ -96,16 +106,36 @@ class OSINTEnvironment(Env):
|
|
| 96 |
output = self.tools.call(tool_name, args)
|
| 97 |
self.state.tool_outputs.append({"tool": tool_name, "args": args, "output": output})
|
| 98 |
self.semantic_memory.add(f"{tool_name} {args} {output}", {"tool": tool_name})
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
def _handle_add_edge(self, payload: dict[str, Any]) -> float:
|
| 102 |
if self.state is None:
|
| 103 |
return 0.0
|
| 104 |
edge = Edge(payload["src"], payload["rel"], payload["dst"], float(payload.get("confidence", 1.0)))
|
|
|
|
| 105 |
added = self.memory_graph.add_edge(edge)
|
| 106 |
if not added:
|
|
|
|
| 107 |
return -0.15
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
def _handle_answer(self, payload: dict[str, Any]) -> float:
|
| 111 |
if self.state is None:
|
|
@@ -113,9 +143,34 @@ class OSINTEnvironment(Env):
|
|
| 113 |
proposed = str(payload.get("answer", "")).strip()
|
| 114 |
self.state.answer = proposed
|
| 115 |
self.state.done = True
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
def _observation(self) -> Observation:
|
| 121 |
if self.state is None:
|
|
@@ -137,4 +192,6 @@ class OSINTEnvironment(Env):
|
|
| 137 |
"redundant_tool_calls": self.state.redundant_tool_calls,
|
| 138 |
"task_answer": self.state.task.answer,
|
| 139 |
"agent_answer": self.state.answer,
|
|
|
|
|
|
|
| 140 |
}
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
+
from typing import TYPE_CHECKING, Any
|
| 5 |
|
| 6 |
from openenv.env import Env
|
| 7 |
|
| 8 |
from osint_env.data.generator import DatasetGenerator
|
| 9 |
from osint_env.domain.models import Action, ActionType, Edge, EnvironmentConfig, Observation, TaskInstance
|
| 10 |
+
from osint_env.env.reward import (
|
| 11 |
+
build_reward_model,
|
| 12 |
+
compute_answer_reward,
|
| 13 |
+
compute_edge_reward,
|
| 14 |
+
compute_graph_f1,
|
| 15 |
+
)
|
| 16 |
from osint_env.memory.store import MemoryGraph, SemanticMemory
|
| 17 |
from osint_env.platforms.tools import ToolRegistry
|
| 18 |
|
| 19 |
+
if TYPE_CHECKING:
|
| 20 |
+
from osint_env.llm.interface import LLMClient
|
| 21 |
+
|
| 22 |
|
| 23 |
@dataclass(slots=True)
|
| 24 |
class EpisodeState:
|
|
|
|
| 32 |
tool_outputs: list[dict[str, Any]] = field(default_factory=list)
|
| 33 |
answer: str | None = None
|
| 34 |
call_fingerprints: set[str] = field(default_factory=set)
|
| 35 |
+
reward_components: dict[str, float] = field(default_factory=dict)
|
| 36 |
|
| 37 |
|
| 38 |
class OSINTEnvironment(Env):
|
| 39 |
+
def __init__(self, config: EnvironmentConfig, llm: "LLMClient | None" = None):
|
| 40 |
super().__init__(
|
| 41 |
name="OSINTEnvironment",
|
| 42 |
state_space="json-observation",
|
|
|
|
| 44 |
episode_max_length=config.max_steps,
|
| 45 |
)
|
| 46 |
self.config = config
|
| 47 |
+
self.generator = DatasetGenerator(config, llm=llm)
|
| 48 |
self.graph = self.generator.build_canonical_graph()
|
| 49 |
self.views = self.generator.build_platform_views(self.graph)
|
| 50 |
self.tasks = self.generator.generate_tasks(self.graph, self.views, count=24)
|
| 51 |
+
self.reward_model = build_reward_model(self.graph)
|
| 52 |
self.tools = ToolRegistry(self.views)
|
| 53 |
self.memory_graph = MemoryGraph()
|
| 54 |
self.semantic_memory = SemanticMemory()
|
|
|
|
| 106 |
output = self.tools.call(tool_name, args)
|
| 107 |
self.state.tool_outputs.append({"tool": tool_name, "args": args, "output": output})
|
| 108 |
self.semantic_memory.add(f"{tool_name} {args} {output}", {"tool": tool_name})
|
| 109 |
+
relevance_bonus = 0.08 * self._tool_relevance(self.state.task, output)
|
| 110 |
+
total = penalty + relevance_bonus
|
| 111 |
+
self._accumulate_reward_components(
|
| 112 |
+
{
|
| 113 |
+
"tool_novelty": penalty,
|
| 114 |
+
"tool_relevance": relevance_bonus,
|
| 115 |
+
}
|
| 116 |
+
)
|
| 117 |
+
return total
|
| 118 |
|
| 119 |
def _handle_add_edge(self, payload: dict[str, Any]) -> float:
    """Insert a hypothesis edge into agent memory and return its shaped reward.

    Duplicate edges earn a flat -0.15 penalty; accepted edges are scored by
    ``compute_edge_reward`` against the pre-insertion memory snapshot.
    """
    if self.state is None:
        return 0.0
    edge = Edge(payload["src"], payload["rel"], payload["dst"], float(payload.get("confidence", 1.0)))
    # Snapshot BEFORE mutation so novelty is judged against prior memory.
    prior_edges = list(self.memory_graph.edges)
    if not self.memory_graph.add_edge(edge):
        self._accumulate_reward_components({"duplicate_edge_penalty": -0.15})
        return -0.15

    breakdown = compute_edge_reward(
        edge=edge,
        task=self.state.task,
        existing_edges=prior_edges,
        step_count=self.state.step_count,
        model=self.reward_model,
        graph=self.graph,
    )
    self._accumulate_reward_components(breakdown.to_dict())
    return breakdown.total
|
| 139 |
|
| 140 |
def _handle_answer(self, payload: dict[str, Any]) -> float:
    """Record the agent's final answer, end the episode, and score it."""
    if self.state is None:
        return 0.0
    proposed = str(payload.get("answer", "")).strip()
    self.state.answer = proposed
    self.state.done = True
    breakdown = compute_answer_reward(
        proposed_answer=proposed,
        task=self.state.task,
        pred_edges=self.memory_graph.edges,
        tool_outputs=self.state.tool_outputs,
        step_count=self.state.step_count,
        model=self.reward_model,
    )
    self._accumulate_reward_components(breakdown.to_dict())
    return breakdown.total
|
| 156 |
+
|
| 157 |
+
def _tool_relevance(self, task: TaskInstance, output: dict[str, Any]) -> float:
|
| 158 |
+
haystack = str(output).lower()
|
| 159 |
+
clues = {task.answer.lower()}
|
| 160 |
+
for edge in task.supporting_edges:
|
| 161 |
+
clues.add(edge.src.lower())
|
| 162 |
+
clues.add(edge.dst.lower())
|
| 163 |
+
clues.add(edge.rel.lower())
|
| 164 |
+
if not clues:
|
| 165 |
+
return 0.0
|
| 166 |
+
matches = sum(1 for token in clues if token in haystack)
|
| 167 |
+
return matches / len(clues)
|
| 168 |
+
|
| 169 |
+
def _accumulate_reward_components(self, values: dict[str, float]) -> None:
|
| 170 |
+
if self.state is None:
|
| 171 |
+
return
|
| 172 |
+
for key, value in values.items():
|
| 173 |
+
self.state.reward_components[key] = self.state.reward_components.get(key, 0.0) + float(value)
|
| 174 |
|
| 175 |
def _observation(self) -> Observation:
|
| 176 |
if self.state is None:
|
|
|
|
| 192 |
"redundant_tool_calls": self.state.redundant_tool_calls,
|
| 193 |
"task_answer": self.state.task.answer,
|
| 194 |
"agent_answer": self.state.answer,
|
| 195 |
+
"graph_f1": compute_graph_f1(self.memory_graph.edges, self.state.task.supporting_edges),
|
| 196 |
+
"reward_components": dict(self.state.reward_components),
|
| 197 |
}
|
src/osint_env/env/reward.py
CHANGED
|
@@ -1,12 +1,417 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def edge_in_truth(edge: Edge, task: TaskInstance) -> bool:
|
| 7 |
return any(e.src == edge.src and e.rel == edge.rel and e.dst == edge.dst for e in task.supporting_edges)
|
| 8 |
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def compute_graph_f1(pred_edges: list[Edge], truth_edges: list[Edge]) -> float:
|
| 11 |
pred = {(e.src, e.rel, e.dst) for e in pred_edges}
|
| 12 |
truth = {(e.src, e.rel, e.dst) for e in truth_edges}
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
import re
|
| 6 |
+
from collections import Counter
|
| 7 |
+
from dataclasses import asdict, dataclass
|
| 8 |
+
|
| 9 |
+
from osint_env.domain.models import CanonicalGraph, Edge, TaskInstance
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass(slots=True)
class RewardModel:
    """Precomputed corpus statistics used to shape edge and answer rewards."""

    # Per-relation inverse-document-frequency weights; rarer relations score higher.
    relation_idf: dict[str, float]
    # Largest value in relation_idf, used to normalize into (0, 1].
    max_relation_idf: float
    # log(1 + degree) per node id; high-degree hubs are treated as less informative.
    hub_penalty: dict[str, float]
    # Largest value in hub_penalty, used for normalization.
    max_hub_penalty: float
    # (src_type, rel, dst_type) -> P(type pair | rel) priors estimated from the canonical graph.
    type_priors: dict[tuple[str, str, str], float]
|
| 20 |
+
|
| 21 |
+
@dataclass(slots=True)
class EdgeRewardBreakdown:
    """Additive components of a single edge-assertion reward (total = their sum)."""

    total: float
    # DeepPath-style hit/miss term against the task's supporting edges.
    global_accuracy: float
    # Soft plausibility shaping for edges not literally in the ground truth.
    soft_shaping: float
    # Bonus inversely proportional to the step at which the edge was asserted.
    efficiency: float
    # Novelty of the edge pattern versus edges already in memory.
    diversity: float
    # IDF-based informativeness of the asserted relation.
    relation_informativeness: float
    # Hub-penalty-based informativeness of the endpoints.
    entity_informativeness: float
    # Structural bonus/penalty for bridging vs. shortcutting components.
    connectivity_gain: float

    def to_dict(self) -> dict[str, float]:
        """Flatten all components (including total) into a plain dict."""
        return asdict(self)
|
| 35 |
+
|
| 36 |
+
@dataclass(slots=True)
class AnswerRewardBreakdown:
    """Additive components of the terminal answer reward (total = their sum)."""

    total: float
    # Penalizes empty answers before correctness is judged.
    format_reward: float
    # Exact-match reward against the task's gold answer.
    correctness: float
    # Whether the answer is deducible from the predicted subgraph.
    knowledge_carrier: float
    # Recall of gold terms across the episode's tool outputs.
    knowledge_indexing: float
    # UniRel-style connectivity of seed entities in the predicted graph.
    connectivity: float
    # Weighted triple-level F1 of predicted vs. supporting edges.
    graph_f1: float
    # Bonus inversely proportional to episode length.
    efficiency: float
    # Penalty per predicted edge beyond the ground-truth count.
    compactness: float
    # Mean relation informativeness of the predicted subgraph.
    relation_informativeness: float
    # Mean endpoint informativeness of the predicted subgraph.
    entity_informativeness: float
    # Penalty for relation repetition within the predicted subgraph.
    repetition_penalty: float

    def to_dict(self) -> dict[str, float]:
        """Flatten all components (including total) into a plain dict."""
        return asdict(self)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def build_reward_model(graph: CanonicalGraph) -> RewardModel:
    """Precompute corpus statistics over the canonical graph for reward shaping.

    Computes three families of statistics:
      * relation IDF: log((1 + |E|) / (1 + freq)) + 1 per relation;
      * hub penalties: log(1 + degree) per node (undirected degree);
      * type priors: P(src_type, dst_type | rel) over edges whose endpoints resolve.
    """
    relation_freq: Counter[str] = Counter(e.rel for e in graph.edges)
    total_edges = max(1, len(graph.edges))  # guard against an empty graph
    relation_idf = {
        rel: math.log((1.0 + total_edges) / (1.0 + freq)) + 1.0 for rel, freq in relation_freq.items()
    }
    max_relation_idf = max(relation_idf.values()) if relation_idf else 1.0

    # Undirected degree: both endpoints of every edge count.
    degree: Counter[str] = Counter()
    for edge in graph.edges:
        degree[edge.src] += 1
        degree[edge.dst] += 1
    hub_penalty = {node_id: math.log(1.0 + deg) for node_id, deg in degree.items()}
    max_hub_penalty = max(hub_penalty.values()) if hub_penalty else 1.0

    # Type priors: edges with unresolvable endpoints are excluded from BOTH
    # numerator and denominator so the conditional stays consistent.
    type_counts: Counter[tuple[str, str, str]] = Counter()
    rel_counts: Counter[str] = Counter()
    for edge in graph.edges:
        src = graph.nodes.get(edge.src)
        dst = graph.nodes.get(edge.dst)
        if src is None or dst is None:
            continue
        key = (str(src.node_type.value), edge.rel, str(dst.node_type.value))
        type_counts[key] += 1
        rel_counts[edge.rel] += 1
    type_priors = {
        key: count / max(1, rel_counts[key[1]]) for key, count in type_counts.items()
    }

    return RewardModel(
        relation_idf=relation_idf,
        max_relation_idf=max_relation_idf,
        hub_penalty=hub_penalty,
        max_hub_penalty=max_hub_penalty,
        type_priors=type_priors,
    )
|
| 91 |
|
| 92 |
|
| 93 |
def edge_in_truth(edge: Edge, task: TaskInstance) -> bool:
    """Return True when (src, rel, dst) exactly matches a supporting edge of *task*."""
    target = (edge.src, edge.rel, edge.dst)
    return any((e.src, e.rel, e.dst) == target for e in task.supporting_edges)
|
| 95 |
|
| 96 |
|
| 97 |
+
def _cosine(a: Counter[str], b: Counter[str]) -> float:
|
| 98 |
+
common = set(a) & set(b)
|
| 99 |
+
num = sum(a[t] * b[t] for t in common)
|
| 100 |
+
den = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
|
| 101 |
+
return (num / den) if den else 0.0
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _edge_signature(edge: Edge) -> Counter[str]:
|
| 105 |
+
# Approximate path/edge embedding using relation and endpoint prefixes.
|
| 106 |
+
src_prefix = edge.src.split("_", 1)[0]
|
| 107 |
+
dst_prefix = edge.dst.split("_", 1)[0]
|
| 108 |
+
return Counter({f"rel:{edge.rel}": 2, f"src:{src_prefix}": 1, f"dst:{dst_prefix}": 1})
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _soft_fact_score(edge: Edge, model: RewardModel, graph: CanonicalGraph) -> float:
|
| 112 |
+
if any(e.src == edge.src and e.rel == edge.rel and e.dst == edge.dst for e in graph.edges):
|
| 113 |
+
return 1.0
|
| 114 |
+
|
| 115 |
+
src = graph.nodes.get(edge.src)
|
| 116 |
+
dst = graph.nodes.get(edge.dst)
|
| 117 |
+
if src is None or dst is None:
|
| 118 |
+
return 0.0
|
| 119 |
+
|
| 120 |
+
type_key = (str(src.node_type.value), edge.rel, str(dst.node_type.value))
|
| 121 |
+
prior = model.type_priors.get(type_key, 0.0)
|
| 122 |
+
|
| 123 |
+
# A tiny domain heuristic: alias links are common and worth soft credit even without exact support edge.
|
| 124 |
+
alias_bias = 0.2 if (edge.rel == "alias_of" and edge.src.startswith("alias_") and edge.dst.startswith("user_")) else 0.0
|
| 125 |
+
relation_exists = any(e.rel == edge.rel for e in graph.edges)
|
| 126 |
+
relation_bonus = 0.1 if relation_exists else 0.0
|
| 127 |
+
return max(0.0, min(1.0, 0.1 + (0.65 * prior) + alias_bias + relation_bonus))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _normalized_relation_info(rel: str, model: RewardModel) -> float:
|
| 131 |
+
idf = model.relation_idf.get(rel, 1.0)
|
| 132 |
+
return idf / max(1e-6, model.max_relation_idf)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _normalized_entity_info(src: str, dst: str, model: RewardModel) -> float:
|
| 136 |
+
src_h = model.hub_penalty.get(src, 0.0)
|
| 137 |
+
dst_h = model.hub_penalty.get(dst, 0.0)
|
| 138 |
+
mean_hub = (src_h + dst_h) / 2.0
|
| 139 |
+
# UniRel-style preference for low-degree intermediates: lower hub penalty -> higher informativeness.
|
| 140 |
+
return 1.0 - (mean_hub / max(1e-6, model.max_hub_penalty))
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _is_reachable_undirected(edges: list[Edge], src: str, dst: str) -> bool:
|
| 144 |
+
if src == dst:
|
| 145 |
+
return True
|
| 146 |
+
adj: dict[str, set[str]] = {}
|
| 147 |
+
for edge in edges:
|
| 148 |
+
adj.setdefault(edge.src, set()).add(edge.dst)
|
| 149 |
+
adj.setdefault(edge.dst, set()).add(edge.src)
|
| 150 |
+
seen = {src}
|
| 151 |
+
stack = [src]
|
| 152 |
+
while stack:
|
| 153 |
+
node = stack.pop()
|
| 154 |
+
for nxt in adj.get(node, set()):
|
| 155 |
+
if nxt == dst:
|
| 156 |
+
return True
|
| 157 |
+
if nxt not in seen:
|
| 158 |
+
seen.add(nxt)
|
| 159 |
+
stack.append(nxt)
|
| 160 |
+
return False
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _connectivity_gain(edge: Edge, existing_edges: list[Edge]) -> float:
    """Structural shaping for a candidate edge.

    Self-loops cost -0.06, shortcuts between already-connected endpoints cost
    -0.03, and edges that bridge disconnected regions earn +0.10.
    """
    if edge.src == edge.dst:
        return -0.06  # self-loop adds no structure
    if _is_reachable_undirected(existing_edges, edge.src, edge.dst):
        return -0.03  # endpoints already connected: redundant shortcut
    return 0.10  # bridges two previously disconnected regions
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def compute_edge_reward(
    edge: Edge,
    task: TaskInstance,
    existing_edges: list[Edge],
    step_count: int,
    model: RewardModel,
    graph: CanonicalGraph,
) -> EdgeRewardBreakdown:
    """Score a newly asserted edge with DeepPath/UniRel-inspired shaping terms.

    Args:
        edge: The edge the agent just asserted.
        task: Current task instance supplying the ground-truth supporting edges.
        existing_edges: Snapshot of the agent's memory graph *before* this edge.
        step_count: Step index within the episode (earlier useful edges score higher).
        model: Precomputed corpus statistics (IDF, hub penalties, type priors).
        graph: Canonical world graph used for soft plausibility scoring.

    Returns:
        EdgeRewardBreakdown whose ``total`` is the sum of all components.
    """
    in_truth = edge_in_truth(edge, task)

    # DeepPath-inspired global accuracy term.
    global_accuracy = 0.85 if in_truth else -0.55

    # D18 reward shaping: R = Rb + (1 - Rb) * f, where f is a soft fact plausibility score.
    base_reward = 1.0 if in_truth else 0.0
    shaped = base_reward + ((1.0 - base_reward) * _soft_fact_score(edge, model, graph))
    # Centered on 0.5 so implausible edges subtract rather than merely add less.
    soft_shaping = 0.30 * (shaped - 0.5)

    # DeepPath-inspired efficiency term: earlier useful edges are better.
    efficiency = 0.10 * (1.0 / max(1, step_count))

    # DeepPath-inspired diversity term: discourage repeated edge patterns.
    if not existing_edges:
        diversity = 0.08  # first edge is trivially novel
    else:
        new_sig = _edge_signature(edge)
        avg_similarity = sum(_cosine(new_sig, _edge_signature(e)) for e in existing_edges) / len(existing_edges)
        novelty = 1.0 - avg_similarity
        diversity = 0.14 * (novelty - 0.5)

    # UniRel-style informativeness terms (centered so common relations/hubs subtract).
    relation_informativeness = 0.12 * (_normalized_relation_info(edge.rel, model) - 0.5)
    entity_informativeness = 0.12 * (_normalized_entity_info(edge.src, edge.dst, model) - 0.5)

    # Additional structural utility shaping for KG construction.
    connectivity_gain = _connectivity_gain(edge, existing_edges)

    total = (
        global_accuracy
        + soft_shaping
        + efficiency
        + diversity
        + relation_informativeness
        + entity_informativeness
        + connectivity_gain
    )
    return EdgeRewardBreakdown(
        total=total,
        global_accuracy=global_accuracy,
        soft_shaping=soft_shaping,
        efficiency=efficiency,
        diversity=diversity,
        relation_informativeness=relation_informativeness,
        entity_informativeness=entity_informativeness,
        connectivity_gain=connectivity_gain,
    )
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _connectivity_ratio(pred_edges: list[Edge], task: TaskInstance) -> float:
|
| 232 |
+
nodes = {e.src for e in task.supporting_edges} | {e.dst for e in task.supporting_edges}
|
| 233 |
+
if len(nodes) <= 1:
|
| 234 |
+
return 1.0
|
| 235 |
+
|
| 236 |
+
adj: dict[str, set[str]] = {}
|
| 237 |
+
for edge in pred_edges:
|
| 238 |
+
adj.setdefault(edge.src, set()).add(edge.dst)
|
| 239 |
+
adj.setdefault(edge.dst, set()).add(edge.src)
|
| 240 |
+
|
| 241 |
+
start = next(iter(nodes))
|
| 242 |
+
seen = {start}
|
| 243 |
+
stack = [start]
|
| 244 |
+
while stack:
|
| 245 |
+
cur = stack.pop()
|
| 246 |
+
for nxt in adj.get(cur, set()):
|
| 247 |
+
if nxt not in seen:
|
| 248 |
+
seen.add(nxt)
|
| 249 |
+
stack.append(nxt)
|
| 250 |
+
return len(seen & nodes) / max(1, len(nodes))
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def _knowledge_indexing_recall(task: TaskInstance, tool_outputs: list[dict[str, object]]) -> float:
|
| 254 |
+
gold_terms = {task.answer.lower()}
|
| 255 |
+
for edge in task.supporting_edges:
|
| 256 |
+
gold_terms.add(edge.src.lower())
|
| 257 |
+
gold_terms.add(edge.dst.lower())
|
| 258 |
+
gold_terms.add(edge.rel.lower())
|
| 259 |
+
|
| 260 |
+
serialized = json.dumps(tool_outputs).lower()
|
| 261 |
+
covered = sum(1 for term in gold_terms if term and term in serialized)
|
| 262 |
+
return covered / max(1, len(gold_terms))
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _knowledge_carrier_reward(pred_edges: list[Edge], task: TaskInstance) -> float:
|
| 266 |
+
pred = {(e.src, e.rel, e.dst) for e in pred_edges}
|
| 267 |
+
truth = {(e.src, e.rel, e.dst) for e in task.supporting_edges}
|
| 268 |
+
deducible = bool(truth & pred)
|
| 269 |
+
return 0.4 if deducible else -0.2
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _extract_query_entities(question: str) -> set[str]:
|
| 273 |
+
pattern = r"\b(?:alias|user|org|loc|post|thr|thread|event)_[a-zA-Z0-9_]+\b"
|
| 274 |
+
return set(re.findall(pattern, question))
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def _max_connected_seed_count(pred_edges: list[Edge], seeds: set[str]) -> int:
|
| 278 |
+
if not seeds:
|
| 279 |
+
return 0
|
| 280 |
+
adj: dict[str, set[str]] = {}
|
| 281 |
+
for edge in pred_edges:
|
| 282 |
+
adj.setdefault(edge.src, set()).add(edge.dst)
|
| 283 |
+
adj.setdefault(edge.dst, set()).add(edge.src)
|
| 284 |
+
|
| 285 |
+
best = 1
|
| 286 |
+
for seed in seeds:
|
| 287 |
+
seen = {seed}
|
| 288 |
+
stack = [seed]
|
| 289 |
+
while stack:
|
| 290 |
+
cur = stack.pop()
|
| 291 |
+
for nxt in adj.get(cur, set()):
|
| 292 |
+
if nxt not in seen:
|
| 293 |
+
seen.add(nxt)
|
| 294 |
+
stack.append(nxt)
|
| 295 |
+
connected_seed_count = len(seeds & seen)
|
| 296 |
+
best = max(best, connected_seed_count)
|
| 297 |
+
return best
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _unirel_connectivity_score(pred_edges: list[Edge], seeds: set[str]) -> float:
    """UniRel-style discrete connectivity over *seeds*, projected onto [-1, 1].

    Returns 0.0 when there are at most one seed or the discrete range collapses.
    """
    n = len(seeds)
    if n <= 1:
        return 0.0

    lo = -math.floor(n / 2)
    hi = math.ceil(n / 2) - 1
    if hi <= lo:
        return 0.0
    raw = lo + max(0, _max_connected_seed_count(pred_edges, seeds) - 1)
    return ((raw - lo) / (hi - lo)) * 2.0 - 1.0
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def _subgraph_relation_informativeness(pred_edges: list[Edge], model: RewardModel | None) -> float:
    """Mean normalized relation informativeness of the predicted subgraph,
    centered on 0 (0.0 when no edges or no model)."""
    if not pred_edges or model is None:
        return 0.0
    total = sum(_normalized_relation_info(e.rel, model) for e in pred_edges)
    return total / len(pred_edges) - 0.5
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def _subgraph_entity_informativeness(pred_edges: list[Edge], model: RewardModel | None) -> float:
    """Mean normalized endpoint informativeness of the predicted subgraph,
    centered on 0 (0.0 when no edges or no model)."""
    if not pred_edges or model is None:
        return 0.0
    total = sum(_normalized_entity_info(e.src, e.dst, model) for e in pred_edges)
    return total / len(pred_edges) - 0.5
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _relation_repetition_ratio(pred_edges: list[Edge]) -> float:
|
| 330 |
+
if len(pred_edges) <= 1:
|
| 331 |
+
return 0.0
|
| 332 |
+
rels = [edge.rel for edge in pred_edges]
|
| 333 |
+
unique = len(set(rels))
|
| 334 |
+
return 1.0 - (unique / len(rels))
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def _deducible_answer(proposed_answer: str, task: TaskInstance, pred_edges: list[Edge]) -> bool:
    """True when the proposed answer is correct AND supported by the predicted graph.

    Support means either triple overlap with the ground truth, or undirected
    reachability from any query seed entity to the answer node.
    """
    if proposed_answer != task.answer:
        return False
    truth = {(e.src, e.rel, e.dst) for e in task.supporting_edges}
    predicted = {(e.src, e.rel, e.dst) for e in pred_edges}
    if truth & predicted:
        return True

    # any(...) over an empty seed set is False, matching the no-seeds case.
    seeds = _extract_query_entities(task.question)
    return any(
        _is_reachable_undirected(pred_edges, seed, proposed_answer) for seed in seeds
    )
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def compute_answer_reward(
    proposed_answer: str,
    task: TaskInstance,
    pred_edges: list[Edge],
    tool_outputs: list[dict[str, object]],
    step_count: int,
    model: RewardModel | None = None,
) -> AnswerRewardBreakdown:
    """Score the terminal answer and the quality of the supporting predicted subgraph.

    Args:
        proposed_answer: The agent's final (stripped) answer string.
        task: Task instance with gold answer, question, and supporting edges.
        pred_edges: Edges the agent asserted into its memory graph.
        tool_outputs: All tool outputs gathered during the episode.
        step_count: Total steps taken (shorter episodes earn more efficiency).
        model: Optional corpus statistics; informativeness terms are 0 without it.

    Returns:
        AnswerRewardBreakdown whose ``total`` is the sum of all components.
    """
    # Format gate: an empty answer is heavily penalized before correctness is judged.
    format_reward = 0.15 if proposed_answer else -0.55
    correctness = 1.15 if proposed_answer == task.answer else -1.0

    # AutoGraph-R1 style task utility decomposition.
    knowledge_carrier = 0.50 if _deducible_answer(proposed_answer, task, pred_edges) else -0.25
    knowledge_indexing = 0.45 * _knowledge_indexing_recall(task, tool_outputs)

    # UniRel-style connectivity over seed entities (the answer node counts as a seed).
    seed_entities = _extract_query_entities(task.question)
    seed_entities.add(task.answer)
    connectivity = 0.30 * _unirel_connectivity_score(pred_edges, seed_entities)

    graph_f1 = 0.55 * compute_graph_f1(pred_edges, task.supporting_edges)
    efficiency = 0.12 * (1.0 / max(1, step_count))

    # Compactness: each predicted edge beyond the ground-truth count costs a little.
    extra_edges = max(0, len(pred_edges) - len(task.supporting_edges))
    compactness = -0.05 * extra_edges

    relation_informativeness = 0.12 * _subgraph_relation_informativeness(pred_edges, model)
    entity_informativeness = 0.12 * _subgraph_entity_informativeness(pred_edges, model)

    # AutoGraph-R1 repetition control variant used in larger models.
    repetition_penalty = -0.10 * _relation_repetition_ratio(pred_edges)

    total = (
        format_reward
        + correctness
        + knowledge_carrier
        + knowledge_indexing
        + connectivity
        + graph_f1
        + efficiency
        + compactness
        + relation_informativeness
        + entity_informativeness
        + repetition_penalty
    )
    return AnswerRewardBreakdown(
        total=total,
        format_reward=format_reward,
        correctness=correctness,
        knowledge_carrier=knowledge_carrier,
        knowledge_indexing=knowledge_indexing,
        connectivity=connectivity,
        graph_f1=graph_f1,
        efficiency=efficiency,
        compactness=compactness,
        relation_informativeness=relation_informativeness,
        entity_informativeness=entity_informativeness,
        repetition_penalty=repetition_penalty,
    )
|
| 413 |
+
|
| 414 |
+
|
| 415 |
def compute_graph_f1(pred_edges: list[Edge], truth_edges: list[Edge]) -> float:
|
| 416 |
pred = {(e.src, e.rel, e.dst) for e in pred_edges}
|
| 417 |
truth = {(e.src, e.rel, e.dst) for e in truth_edges}
|
src/osint_env/env/spawn_reward_hooks.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def critical_steps(main_steps: list[int], parallel_subagent_steps: list[list[int]]) -> int:
    """Critical-path step count used in Kimi-style PARL latency shaping.

    Per stage t the cost is Smain(t) + max_i Ssub,i(t): the main agent's steps
    plus the slowest parallel sub-agent for that stage. Negative entries are
    clamped to 0 and an empty sub-agent list contributes 0.

    Raises:
        ValueError: when the two stage lists differ in length.
    """
    if len(main_steps) != len(parallel_subagent_steps):
        raise ValueError("main_steps and parallel_subagent_steps must have the same length")

    total = 0
    for stage_main, stage_subs in zip(main_steps, parallel_subagent_steps):
        slowest_sub = max((max(0, int(steps)) for steps in stage_subs), default=0)
        total += max(0, int(stage_main)) + slowest_sub
    return total
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def parl_style_spawn_reward(
    task_outcome_reward: float,
    spawn_count: int,
    finished_subtasks: int,
    critical_steps: int,
    lambda_parallel: float = 0.15,
    lambda_finish: float = 0.20,
    anneal: float = 1.0,
    breadth: int | None = None,
    depth: int | None = None,
    max_parallel_hint: int | None = None,
) -> float:
    """Kimi K2.5 inspired PARL reward utility for future multi-agent branches.

    Pure reward-shape helper (no orchestration):

        r_parl = r_perf + a * (lambda_parallel * r_parallel
                               + lambda_finish * r_finish + r_latency)

    where r_parallel encourages non-zero spawning (avoids serial collapse),
    r_finish rewards meaningful completion (prevents spawn-only reward hacking),
    and r_latency favors lower critical-step execution paths. Optional
    breadth/depth telemetry adds small shaping terms.
    """
    # Sanitize every input into its valid range before shaping.
    spawns = max(0, int(spawn_count))
    finished = max(0, int(finished_subtasks))
    steps = max(1, int(critical_steps))
    anneal_factor = max(0.0, min(1.0, anneal))
    weight_parallel = max(0.0, float(lambda_parallel))
    weight_finish = max(0.0, float(lambda_finish))
    tree_breadth = max(0, int(breadth or 0))
    tree_depth = max(0, int(depth or 0))
    parallel_hint = max(0, int(max_parallel_hint or 0))

    if spawns == 0:
        r_parallel = 0.0
        r_finish = 0.0
    else:
        # Saturating incentive so reward cannot grow unbounded with spawns.
        r_parallel = math.tanh(spawns / 4.0)
        if parallel_hint > 0:
            utilization = min(1.0, spawns / parallel_hint)
            r_parallel *= (0.7 + (0.3 * utilization))
        r_finish = min(1.0, finished / spawns)

    breadth_bonus = 0.04 * math.tanh(tree_breadth / 6.0) if tree_breadth > 0 else 0.0
    if tree_depth > 0:
        # Mild depth penalty discourages brittle over-decomposition chains.
        depth_penalty = -0.03 * math.tanh(max(0, tree_depth - 1) / 4.0)
    else:
        depth_penalty = 0.0

    # Latency shaping hook using critical steps (higher is worse).
    r_latency = 0.05 * (1.0 / steps)

    auxiliary = (
        (weight_parallel * r_parallel)
        + (weight_finish * r_finish)
        + r_latency
        + breadth_bonus
        + depth_penalty
    )
    return float(task_outcome_reward) + (anneal_factor * auxiliary)
|
src/osint_env/eval/leaderboard.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _utc_now() -> str:
|
| 10 |
+
return datetime.now(tz=timezone.utc).replace(microsecond=0).isoformat()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_leaderboard(path: str | Path) -> list[dict[str, Any]]:
    """Read leaderboard records from *path*.

    A missing file or a JSON document that is not a list yields an empty list;
    invalid JSON still raises.
    """
    file_path = Path(path)
    if not file_path.exists():
        return []
    data = json.loads(file_path.read_text(encoding="utf-8"))
    return data if isinstance(data, list) else []
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def save_leaderboard(path: str | Path, records: list[dict[str, Any]]) -> None:
    """Write *records* as indented, key-sorted JSON, creating parent directories."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(records, indent=2, sort_keys=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _metric_value(record: dict[str, Any], sort_by: str) -> float:
|
| 32 |
+
metrics = record.get("metrics", {})
|
| 33 |
+
return float(metrics.get(sort_by, 0.0))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def sorted_leaderboard(records: list[dict[str, Any]], sort_by: str = "leaderboard_score") -> list[dict[str, Any]]:
    """Return records ordered best-first by the requested metric (stable for ties)."""
    def rank_key(record: dict[str, Any]) -> float:
        return _metric_value(record, sort_by)

    return sorted(records, key=rank_key, reverse=True)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def append_leaderboard_record(
    path: str | Path,
    summary: dict[str, Any],
    episodes: int,
    run_name: str | None = None,
    config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Append one evaluation run to the leaderboard file and return the stored record.

    The run id is derived from the current record count, so ids stay unique
    only while the file is append-only.
    """
    records = load_leaderboard(path)
    run_id = f"run_{len(records) + 1:04d}"
    entry: dict[str, Any] = {
        "run_id": run_id,
        "run_name": run_name or run_id,
        "created_at": _utc_now(),
        "episodes": int(episodes),
        "config": config or {},
        "metrics": summary,
    }
    records.append(entry)
    save_leaderboard(path, records)
    return entry
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def render_leaderboard_table(records: list[dict[str, Any]], top_k: int = 20, sort_by: str = "leaderboard_score") -> str:
    """Render the top *top_k* records as a Markdown table, best-first by *sort_by*.

    Missing metrics render as 0; the returned string has no trailing newline.
    """
    ranked = sorted_leaderboard(records, sort_by=sort_by)[:top_k]
    header = "| rank | run | score | success | graph_f1 | retrieval | structural | spawn | reward | tool_eff |\n"
    sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|\n"
    rows: list[str] = []
    for idx, rec in enumerate(ranked, start=1):
        m = rec.get("metrics", {})
        rows.append(
            "| {rank} | {run} | {score:.4f} | {succ:.3f} | {f1:.3f} | {retrieval:.3f} | {structural:.3f} | {spawn:.3f} | {reward:.3f} | {tool:.3f} |".format(
                rank=idx,
                run=rec.get("run_name", rec.get("run_id", "run")),
                score=float(m.get("leaderboard_score", 0.0)),
                succ=float(m.get("task_success_rate", 0.0)),
                f1=float(m.get("avg_graph_f1", 0.0)),
                retrieval=float(m.get("retrieval_signal", 0.0)),
                structural=float(m.get("structural_signal", 0.0)),
                spawn=float(m.get("spawn_signal", 0.0)),
                reward=float(m.get("avg_reward", 0.0)),
                tool=float(m.get("tool_efficiency", 0.0)),
            )
        )
    return header + sep + "\n".join(rows)
|
src/osint_env/eval/metrics.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
|
| 5 |
|
|
@@ -14,6 +15,19 @@ class EvalMetrics:
|
|
| 14 |
deanonymization_total: int = 0
|
| 15 |
deanonymization_success: int = 0
|
| 16 |
graph_f1_scores: list[float] = field(default_factory=list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def add(self, info: dict, task_type: str, graph_f1: float) -> None:
|
| 19 |
self.episodes += 1
|
|
@@ -24,17 +38,92 @@ class EvalMetrics:
|
|
| 24 |
self.total_redundant_tool_calls += int(info.get("redundant_tool_calls", 0))
|
| 25 |
self.total_reward += float(info.get("total_reward", 0.0))
|
| 26 |
self.graph_f1_scores.append(graph_f1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
if task_type == "identity_resolution":
|
| 28 |
self.deanonymization_total += 1
|
| 29 |
self.deanonymization_success += int(ok)
|
| 30 |
|
| 31 |
def summary(self) -> dict:
|
| 32 |
episodes = max(1, self.episodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return {
|
| 34 |
-
"task_success_rate":
|
| 35 |
-
"tool_efficiency":
|
| 36 |
-
"avg_graph_f1":
|
| 37 |
"avg_steps_to_solution": self.total_steps / episodes,
|
| 38 |
-
"deanonymization_accuracy":
|
| 39 |
-
"avg_reward":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
}
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import math
|
| 4 |
from dataclasses import dataclass, field
|
| 5 |
|
| 6 |
|
|
|
|
| 15 |
deanonymization_total: int = 0
|
| 16 |
deanonymization_success: int = 0
|
| 17 |
graph_f1_scores: list[float] = field(default_factory=list)
|
| 18 |
+
total_knowledge_carrier: float = 0.0
|
| 19 |
+
total_knowledge_indexing: float = 0.0
|
| 20 |
+
total_connectivity: float = 0.0
|
| 21 |
+
total_format_reward: float = 0.0
|
| 22 |
+
total_relation_informativeness: float = 0.0
|
| 23 |
+
total_entity_informativeness: float = 0.0
|
| 24 |
+
total_diversity: float = 0.0
|
| 25 |
+
total_soft_shaping: float = 0.0
|
| 26 |
+
total_connectivity_gain: float = 0.0
|
| 27 |
+
total_compactness: float = 0.0
|
| 28 |
+
total_spawn_count: int = 0
|
| 29 |
+
total_spawn_finished_subtasks: int = 0
|
| 30 |
+
total_spawn_critical_steps: int = 0
|
| 31 |
|
| 32 |
def add(self, info: dict, task_type: str, graph_f1: float) -> None:
|
| 33 |
self.episodes += 1
|
|
|
|
| 38 |
self.total_redundant_tool_calls += int(info.get("redundant_tool_calls", 0))
|
| 39 |
self.total_reward += float(info.get("total_reward", 0.0))
|
| 40 |
self.graph_f1_scores.append(graph_f1)
|
| 41 |
+
components = info.get("reward_components", {})
|
| 42 |
+
self.total_knowledge_carrier += float(components.get("knowledge_carrier", 0.0))
|
| 43 |
+
self.total_knowledge_indexing += float(components.get("knowledge_indexing", 0.0))
|
| 44 |
+
self.total_connectivity += float(components.get("connectivity", 0.0))
|
| 45 |
+
self.total_format_reward += float(components.get("format_reward", 0.0))
|
| 46 |
+
self.total_relation_informativeness += float(components.get("relation_informativeness", 0.0))
|
| 47 |
+
self.total_entity_informativeness += float(components.get("entity_informativeness", 0.0))
|
| 48 |
+
self.total_diversity += float(components.get("diversity", 0.0))
|
| 49 |
+
self.total_soft_shaping += float(components.get("soft_shaping", 0.0))
|
| 50 |
+
self.total_connectivity_gain += float(components.get("connectivity_gain", 0.0))
|
| 51 |
+
self.total_compactness += float(components.get("compactness", 0.0))
|
| 52 |
+
self.total_spawn_count += int(info.get("spawn_count", 0))
|
| 53 |
+
self.total_spawn_finished_subtasks += int(info.get("spawn_finished_subtasks", 0))
|
| 54 |
+
self.total_spawn_critical_steps += int(info.get("spawn_critical_steps", 0))
|
| 55 |
if task_type == "identity_resolution":
|
| 56 |
self.deanonymization_total += 1
|
| 57 |
self.deanonymization_success += int(ok)
|
| 58 |
|
| 59 |
def summary(self) -> dict:
    """Aggregate the accumulated episode statistics into a benchmark report.

    Returns a flat dict containing per-episode averages of each reward
    component, derived composite signals (retrieval / structural / spawn),
    and the weighted ``leaderboard_score`` used for ranking.
    """
    # Guard against division by zero when no episodes were recorded.
    n = max(1, self.episodes)

    def per_episode(total: float) -> float:
        # Mean over recorded episodes (with the zero-episode guard above).
        return total / n

    def clamp01(value: float) -> float:
        # Clip a composite signal into the [0, 1] band.
        return max(0.0, min(1.0, value))

    success_rate = self.success / n
    efficiency = 1.0 - (self.total_redundant_tool_calls / max(1, self.total_tool_calls))
    mean_f1 = sum(self.graph_f1_scores) / max(1, len(self.graph_f1_scores))
    deanon_acc = self.deanonymization_success / max(1, self.deanonymization_total)
    mean_reward = per_episode(self.total_reward)

    mean_carrier = per_episode(self.total_knowledge_carrier)
    mean_indexing = per_episode(self.total_knowledge_indexing)
    mean_connectivity = per_episode(self.total_connectivity)
    mean_rel_info = per_episode(self.total_relation_informativeness)
    mean_ent_info = per_episode(self.total_entity_informativeness)
    mean_diversity = per_episode(self.total_diversity)
    mean_shaping = per_episode(self.total_soft_shaping)
    mean_conn_gain = per_episode(self.total_connectivity_gain)
    mean_compactness = per_episode(self.total_compactness)

    mean_spawns = per_episode(self.total_spawn_count)
    completion = self.total_spawn_finished_subtasks / max(1, self.total_spawn_count)
    mean_critical = per_episode(self.total_spawn_critical_steps)
    # Fewer critical steps -> higher latency signal, capped at 1.0.
    latency = 1.0 / max(1.0, mean_critical)
    spawn_sig = clamp01(0.6 * completion + 0.4 * latency)

    # Squash the unbounded average reward into (0, 1) via a logistic.
    reward_norm = 1.0 / (1.0 + math.exp(-mean_reward))
    retrieval_sig = clamp01(0.5 + 0.35 * mean_carrier + 0.35 * mean_indexing)
    structural_sig = clamp01(
        0.5
        + 0.25 * mean_connectivity
        + 0.20 * mean_rel_info
        + 0.20 * mean_ent_info
        + 0.15 * mean_diversity
        + 0.10 * mean_conn_gain
    )
    # Fixed-weight blend of all signals; weights sum to 1.04 by design of
    # the original formula (spawn_signal acts as a small bonus term).
    score = (
        0.28 * success_rate
        + 0.20 * mean_f1
        + 0.12 * efficiency
        + 0.12 * deanon_acc
        + 0.14 * retrieval_sig
        + 0.09 * structural_sig
        + 0.05 * reward_norm
        + 0.04 * spawn_sig
    )
    return {
        "task_success_rate": success_rate,
        "tool_efficiency": efficiency,
        "avg_graph_f1": mean_f1,
        "avg_steps_to_solution": self.total_steps / n,
        "deanonymization_accuracy": deanon_acc,
        "avg_reward": mean_reward,
        "avg_knowledge_carrier_reward": mean_carrier,
        "avg_knowledge_indexing_reward": mean_indexing,
        "avg_connectivity_reward": mean_connectivity,
        "avg_format_reward": per_episode(self.total_format_reward),
        "avg_relation_informativeness_reward": mean_rel_info,
        "avg_entity_informativeness_reward": mean_ent_info,
        "avg_diversity_reward": mean_diversity,
        "avg_soft_shaping_reward": mean_shaping,
        "avg_connectivity_gain_reward": mean_conn_gain,
        "avg_compactness_reward": mean_compactness,
        "avg_spawn_count": mean_spawns,
        "spawn_completion_rate": completion,
        "avg_spawn_critical_steps": mean_critical,
        "spawn_signal": spawn_sig,
        "retrieval_signal": retrieval_sig,
        "structural_signal": structural_sig,
        "leaderboard_score": score,
    }
|
src/osint_env/eval/runner.py
CHANGED
|
@@ -1,18 +1,42 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from osint_env.agents.single_agent import SingleAgentRunner
|
|
|
|
| 4 |
from osint_env.env.environment import OSINTEnvironment
|
| 5 |
from osint_env.env.reward import compute_graph_f1
|
| 6 |
from osint_env.eval.metrics import EvalMetrics
|
| 7 |
|
| 8 |
|
| 9 |
-
def run_evaluation(env: OSINTEnvironment, episodes: int = 20) -> dict:
|
| 10 |
metrics = EvalMetrics()
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
for _ in range(episodes):
|
| 13 |
info = runner.run_episode()
|
| 14 |
task_type = env.state.task.task_type if env.state else "unknown"
|
|
|
|
| 15 |
truth = env.state.task.supporting_edges if env.state else []
|
| 16 |
pred = env.memory_graph.edges if env.state else []
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from osint_env.agents.single_agent import SingleAgentRunner
|
| 4 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 5 |
from osint_env.env.environment import OSINTEnvironment
|
| 6 |
from osint_env.env.reward import compute_graph_f1
|
| 7 |
from osint_env.eval.metrics import EvalMetrics
|
| 8 |
|
| 9 |
|
| 10 |
+
def run_evaluation(env: OSINTEnvironment, episodes: int = 20, return_details: bool = False) -> dict:
    """Run *episodes* rollouts against *env* and aggregate benchmark metrics.

    Picks the swarm runner when multi-agent mode is enabled in the env
    config, otherwise the single-agent runner. Returns the metrics summary
    dict, or ``{"summary": ..., "episodes": [...]}`` when ``return_details``
    is set.
    """
    tracker = EvalMetrics()
    runner = SwarmAgentRunner(env=env) if env.config.swarm.enabled else SingleAgentRunner(env=env)
    per_episode: list[dict] = []
    for _ in range(episodes):
        episode_info = runner.run_episode()
        state = env.state
        current_task = state.task if state else None
        task_kind = current_task.task_type if current_task else "unknown"
        # Score predicted memory-graph edges against the task's supporting edges.
        f1 = compute_graph_f1(
            env.memory_graph.edges if state else [],
            current_task.supporting_edges if current_task else [],
        )
        tracker.add(episode_info, task_type=task_kind, graph_f1=f1)
        per_episode.append(
            {
                "task_id": current_task.task_id if current_task else "unknown",
                "task_type": task_kind,
                "graph_f1": f1,
                "reward": float(episode_info.get("total_reward", 0.0)),
                "steps": int(episode_info.get("step_count", 0)),
                "tool_calls": int(episode_info.get("tool_calls", 0)),
                "success": int(episode_info.get("agent_answer") == episode_info.get("task_answer")),
                "reward_components": dict(episode_info.get("reward_components", {})),
                "spawn_count": int(episode_info.get("spawn_count", 0)),
                "spawn_critical_steps": int(episode_info.get("spawn_critical_steps", 0)),
            }
        )
    report = tracker.summary()
    if return_details:
        return {"summary": report, "episodes": per_episode}
    return report
|
src/osint_env/viz/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.viz.dashboard import export_dashboard
|
| 2 |
+
|
| 3 |
+
__all__ = ["export_dashboard"]
|
src/osint_env/viz/dashboard.py
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from osint_env.data.generator import PlatformViews
|
| 8 |
+
from osint_env.domain.models import CanonicalGraph, Edge, TaskInstance
|
| 9 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _safe_label(value: str, fallback: str) -> str:
    """Return *value* stripped of surrounding whitespace, or *fallback* when empty."""
    stripped = str(value).strip()
    if stripped:
        return stripped
    return fallback
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _canonical_graph_payload(graph: CanonicalGraph) -> dict[str, Any]:
    """Serialize the canonical knowledge graph into vis-network node/edge dicts."""
    node_items = []
    for node in graph.nodes.values():
        attrs = node.attrs or {}
        # Prefer a human-readable name/handle; fall back to the raw node id.
        display = _safe_label(str(attrs.get("name") or attrs.get("handle") or node.node_id), node.node_id)
        node_items.append(
            {
                "id": node.node_id,
                "label": display,
                "group": str(node.node_type.value),
                "title": "\\n".join(f"{k}: {v}" for k, v in attrs.items()),
                "attrs": attrs,
            }
        )

    edge_items = [
        {
            "id": f"c_{idx}",
            "from": edge.src,
            "to": edge.dst,
            "label": edge.rel,
            "arrows": "to",
            "color": "#1f2937",
            "width": 1,
            "confidence": float(edge.confidence),
            "status": "canonical",
        }
        for idx, edge in enumerate(graph.edges)
    ]
    return {"nodes": node_items, "edges": edge_items}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _edge_key(edge: Edge) -> tuple[str, str, str]:
    """Canonical ``(src, rel, dst)`` identity used to compare edge sets."""
    return edge.src, edge.rel, edge.dst
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _episode_graph_payload(pred_edges: list[Edge], truth_edges: list[Edge], graph: CanonicalGraph) -> dict[str, Any]:
    """Build a vis-network payload overlaying predicted vs ground-truth edges.

    Edges are color-coded by status: matched (green), predicted-only (blue),
    truth-only (amber, dashed). Node metadata is pulled from the canonical
    graph when available.
    """
    pred_map = {_edge_key(e): e for e in pred_edges}
    truth_map = {_edge_key(e): e for e in truth_edges}
    union_keys = set(pred_map) | set(truth_map)

    # Every endpoint referenced by any compared edge appears as a node.
    endpoint_ids: set[str] = set()
    for src, _, dst in union_keys:
        endpoint_ids.update((src, dst))

    node_items = []
    for node_id in sorted(endpoint_ids):
        node = graph.nodes.get(node_id)
        if node is None:
            # Endpoint unknown to the canonical graph: emit a bare placeholder.
            node_items.append({"id": node_id, "label": node_id, "group": "episode", "attrs": {}})
            continue
        attrs = node.attrs or {}
        display = _safe_label(str(attrs.get("name") or attrs.get("handle") or node_id), node_id)
        node_items.append({"id": node_id, "label": display, "group": str(node.node_type.value), "attrs": attrs})

    edge_items = []
    for idx, key in enumerate(sorted(union_keys)):
        src, rel, dst = key
        if key in truth_map:
            color, dashes, status = (
                ("#16a34a", False, "matched") if key in pred_map else ("#f59e0b", True, "truth_only")
            )
        else:
            color, dashes, status = ("#2563eb", False, "pred_only")
        edge_items.append(
            {
                "id": f"e_{idx}",
                "from": src,
                "to": dst,
                "label": rel,
                "arrows": "to",
                "color": color,
                "dashes": dashes,
                "width": 2,
                "status": status,
                # Prefer the predicted edge's confidence, then truth's; the
                # Edge(...) fallback is unreachable for keys from the union.
                "confidence": float((pred_map.get(key) or truth_map.get(key) or Edge(src, rel, dst)).confidence),
            }
        )

    return {"nodes": node_items, "edges": edge_items}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _views_payload(views: PlatformViews) -> dict[str, Any]:
    """Expose the raw per-platform records for the database explorer panel."""
    return {
        key: getattr(views, key)
        for key in ("microblog_posts", "forum_threads", "profiles")
    }
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _leaderboard_payload(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Rank submissions by ``leaderboard_score`` (descending), capped at 200 rows."""

    def score_of(record: dict[str, Any]) -> float:
        # Records without metrics sort as if they scored 0.0.
        return float(record.get("metrics", {}).get("leaderboard_score", 0.0))

    return sorted(records, key=score_of, reverse=True)[:200]
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def export_dashboard(
|
| 124 |
+
env: OSINTEnvironment,
|
| 125 |
+
evaluation: dict[str, Any],
|
| 126 |
+
leaderboard_records: list[dict[str, Any]],
|
| 127 |
+
output_path: str,
|
| 128 |
+
) -> str:
|
| 129 |
+
summary = evaluation.get("summary", evaluation)
|
| 130 |
+
episodes = evaluation.get("episodes", [])
|
| 131 |
+
|
| 132 |
+
task: TaskInstance | None = env.state.task if env.state else None
|
| 133 |
+
truth_edges = task.supporting_edges if task else []
|
| 134 |
+
pred_edges = env.memory_graph.edges if env.state else []
|
| 135 |
+
|
| 136 |
+
payload = {
|
| 137 |
+
"summary": summary,
|
| 138 |
+
"episodes": episodes,
|
| 139 |
+
"leaderboard": _leaderboard_payload(leaderboard_records),
|
| 140 |
+
"canonical_graph": _canonical_graph_payload(env.graph),
|
| 141 |
+
"episode_graph": _episode_graph_payload(pred_edges, truth_edges, env.graph),
|
| 142 |
+
"views": _views_payload(env.views),
|
| 143 |
+
"task": {
|
| 144 |
+
"task_id": task.task_id if task else "n/a",
|
| 145 |
+
"task_type": task.task_type if task else "n/a",
|
| 146 |
+
"question": task.question if task else "n/a",
|
| 147 |
+
"answer": task.answer if task else "n/a",
|
| 148 |
+
},
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
html = f"""<!doctype html>
|
| 152 |
+
<html lang=\"en\">
|
| 153 |
+
<head>
|
| 154 |
+
<meta charset=\"utf-8\" />
|
| 155 |
+
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />
|
| 156 |
+
<title>OSINT Environment Dashboard</title>
|
| 157 |
+
<link rel=\"preconnect\" href=\"https://fonts.googleapis.com\" />
|
| 158 |
+
<link rel=\"preconnect\" href=\"https://fonts.gstatic.com\" crossorigin />
|
| 159 |
+
<link href=\"https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;600&display=swap\" rel=\"stylesheet\" />
|
| 160 |
+
<link href=\"https://unpkg.com/vis-network@9.1.9/styles/vis-network.min.css\" rel=\"stylesheet\" />
|
| 161 |
+
<script src=\"https://unpkg.com/vis-network@9.1.9/standalone/umd/vis-network.min.js\"></script>
|
| 162 |
+
<script src=\"https://cdn.jsdelivr.net/npm/chart.js@4.4.3/dist/chart.umd.min.js\"></script>
|
| 163 |
+
<style>
|
| 164 |
+
:root {{
|
| 165 |
+
--ink: #1d232f;
|
| 166 |
+
--muted: #5f6d7a;
|
| 167 |
+
--line: #d5dfe8;
|
| 168 |
+
--bg: #f5f8fb;
|
| 169 |
+
--card: #ffffff;
|
| 170 |
+
--brand: #0f766e;
|
| 171 |
+
--brand-soft: #d4f4ef;
|
| 172 |
+
--accent: #d97706;
|
| 173 |
+
--accent-soft: #ffe7c2;
|
| 174 |
+
--ok: #15803d;
|
| 175 |
+
--danger: #b91c1c;
|
| 176 |
+
}}
|
| 177 |
+
* {{ box-sizing: border-box; }}
|
| 178 |
+
body {{
|
| 179 |
+
margin: 0;
|
| 180 |
+
color: var(--ink);
|
| 181 |
+
font-family: \"Space Grotesk\", \"Segoe UI\", sans-serif;
|
| 182 |
+
background:
|
| 183 |
+
radial-gradient(1200px 500px at -5% -20%, #d8efe9, transparent 70%),
|
| 184 |
+
radial-gradient(900px 500px at 110% -10%, #ffe9cf, transparent 65%),
|
| 185 |
+
var(--bg);
|
| 186 |
+
}}
|
| 187 |
+
.wrap {{ max-width: 1500px; margin: 0 auto; padding: 20px; }}
|
| 188 |
+
.card {{
|
| 189 |
+
background: var(--card);
|
| 190 |
+
border: 1px solid var(--line);
|
| 191 |
+
border-radius: 18px;
|
| 192 |
+
padding: 16px;
|
| 193 |
+
box-shadow: 0 10px 24px rgba(24, 39, 59, 0.06);
|
| 194 |
+
}}
|
| 195 |
+
.hero {{
|
| 196 |
+
display: grid;
|
| 197 |
+
grid-template-columns: 2.1fr 1fr;
|
| 198 |
+
gap: 14px;
|
| 199 |
+
margin-bottom: 14px;
|
| 200 |
+
}}
|
| 201 |
+
.hero-main {{
|
| 202 |
+
background: linear-gradient(145deg, #f7fffd, #fff8ef);
|
| 203 |
+
border: 1px solid #e6efe8;
|
| 204 |
+
}}
|
| 205 |
+
h1 {{ margin: 0 0 8px; font-size: 30px; letter-spacing: -0.02em; }}
|
| 206 |
+
h2 {{ margin: 0 0 10px; font-size: 18px; letter-spacing: -0.01em; }}
|
| 207 |
+
.muted {{ color: var(--muted); }}
|
| 208 |
+
.pill-row {{ display: flex; gap: 8px; flex-wrap: wrap; margin-top: 8px; }}
|
| 209 |
+
.pill {{
|
| 210 |
+
border: 1px solid #dce8e6;
|
| 211 |
+
background: #fbfffe;
|
| 212 |
+
border-radius: 999px;
|
| 213 |
+
padding: 4px 10px;
|
| 214 |
+
font-size: 12px;
|
| 215 |
+
color: #214742;
|
| 216 |
+
}}
|
| 217 |
+
.stats {{ display: grid; grid-template-columns: repeat(3, minmax(120px, 1fr)); gap: 10px; margin-top: 10px; }}
|
| 218 |
+
.stat {{
|
| 219 |
+
border: 1px dashed #cde2df;
|
| 220 |
+
background: linear-gradient(180deg, #fcfffe, #f6fffc);
|
| 221 |
+
border-radius: 12px;
|
| 222 |
+
padding: 10px;
|
| 223 |
+
}}
|
| 224 |
+
.stat .k {{ font-size: 11px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.06em; }}
|
| 225 |
+
.stat .v {{ font-size: 22px; font-weight: 700; }}
|
| 226 |
+
.layout {{ display: grid; grid-template-columns: 1.2fr 3fr 1.2fr; gap: 14px; margin-bottom: 14px; }}
|
| 227 |
+
.control-col {{ display: flex; flex-direction: column; gap: 14px; }}
|
| 228 |
+
.control-grid {{ display: grid; gap: 8px; }}
|
| 229 |
+
.graph-wrap {{ position: relative; overflow: hidden; }}
|
| 230 |
+
.graph {{ height: 540px; border: 1px solid var(--line); border-radius: 14px; background: #fbfdff; }}
|
| 231 |
+
.graph-banner {{
|
| 232 |
+
position: absolute;
|
| 233 |
+
top: 10px;
|
| 234 |
+
left: 10px;
|
| 235 |
+
background: rgba(255,255,255,0.93);
|
| 236 |
+
border: 1px solid var(--line);
|
| 237 |
+
border-radius: 12px;
|
| 238 |
+
padding: 6px 10px;
|
| 239 |
+
font-size: 12px;
|
| 240 |
+
z-index: 2;
|
| 241 |
+
backdrop-filter: blur(4px);
|
| 242 |
+
}}
|
| 243 |
+
.legend {{ display: flex; gap: 8px; flex-wrap: wrap; margin-top: 8px; font-size: 12px; }}
|
| 244 |
+
.dot {{ width: 9px; height: 9px; border-radius: 999px; display: inline-block; margin-right: 4px; }}
|
| 245 |
+
.mono {{ font-family: \"IBM Plex Mono\", monospace; font-size: 12px; }}
|
| 246 |
+
.inline {{ display: flex; gap: 8px; align-items: center; }}
|
| 247 |
+
.split {{ display: grid; grid-template-columns: 2fr 1.3fr; gap: 14px; margin-bottom: 14px; }}
|
| 248 |
+
.db-tabs {{ display: flex; gap: 6px; flex-wrap: wrap; margin-bottom: 8px; }}
|
| 249 |
+
.tab {{
|
| 250 |
+
border: 1px solid var(--line);
|
| 251 |
+
border-radius: 9px;
|
| 252 |
+
padding: 5px 10px;
|
| 253 |
+
background: #fff;
|
| 254 |
+
cursor: pointer;
|
| 255 |
+
font-size: 12px;
|
| 256 |
+
}}
|
| 257 |
+
.tab.active {{ background: var(--brand-soft); border-color: #b5e7de; color: #08554e; }}
|
| 258 |
+
.table-wrap {{ max-height: 320px; overflow: auto; border: 1px solid var(--line); border-radius: 12px; }}
|
| 259 |
+
table {{ width: 100%; border-collapse: collapse; font-size: 12.5px; }}
|
| 260 |
+
th, td {{ padding: 8px; border-bottom: 1px solid #edf2f7; text-align: left; vertical-align: top; }}
|
| 261 |
+
th {{ position: sticky; top: 0; background: #f7fbff; z-index: 1; }}
|
| 262 |
+
tr:hover td {{ background: #f9fcff; }}
|
| 263 |
+
.json-view {{
|
| 264 |
+
height: 320px;
|
| 265 |
+
overflow: auto;
|
| 266 |
+
border: 1px solid var(--line);
|
| 267 |
+
border-radius: 12px;
|
| 268 |
+
background: #0f172a;
|
| 269 |
+
color: #d2f8ee;
|
| 270 |
+
padding: 10px;
|
| 271 |
+
margin: 0;
|
| 272 |
+
}}
|
| 273 |
+
.charts {{ display: grid; grid-template-columns: 1fr 1fr; gap: 14px; margin-bottom: 14px; }}
|
| 274 |
+
.chart-box {{ height: 300px; }}
|
| 275 |
+
select, input[type=\"search\"], button {{
|
| 276 |
+
border: 1px solid var(--line);
|
| 277 |
+
border-radius: 9px;
|
| 278 |
+
padding: 8px;
|
| 279 |
+
font: inherit;
|
| 280 |
+
background: #fff;
|
| 281 |
+
color: var(--ink);
|
| 282 |
+
}}
|
| 283 |
+
button {{ cursor: pointer; background: #fff; }}
|
| 284 |
+
button.primary {{ background: var(--brand); border-color: #0e6f68; color: #fff; }}
|
| 285 |
+
.subtle {{ background: #f7fafc; }}
|
| 286 |
+
@media (max-width: 1100px) {{
|
| 287 |
+
.hero, .layout, .split, .charts {{ grid-template-columns: 1fr; }}
|
| 288 |
+
.graph {{ height: 440px; }}
|
| 289 |
+
}}
|
| 290 |
+
</style>
|
| 291 |
+
</head>
|
| 292 |
+
<body>
|
| 293 |
+
<div class=\"wrap\">
|
| 294 |
+
<div class=\"hero\">
|
| 295 |
+
<section class=\"card hero-main\">
|
| 296 |
+
<h1>OSINT Benchmark Dashboard</h1>
|
| 297 |
+
<p class=\"muted\">Interactive explorer for canonical knowledge graph, episode traces, source platform records, and benchmark ranking.</p>
|
| 298 |
+
<div class=\"pill-row\" id=\"hero-pills\"></div>
|
| 299 |
+
<div class=\"stats\" id=\"stats\"></div>
|
| 300 |
+
</section>
|
| 301 |
+
<section class=\"card\">
|
| 302 |
+
<h2>Latest Task Snapshot</h2>
|
| 303 |
+
<div><strong>Task ID:</strong> <span id=\"task-id\"></span></div>
|
| 304 |
+
<div><strong>Task Type:</strong> <span id=\"task-type\"></span></div>
|
| 305 |
+
<div style=\"margin-top:8px\"><strong>Question</strong></div>
|
| 306 |
+
<div id=\"task-question\" class=\"muted\"></div>
|
| 307 |
+
<div style=\"margin-top:8px\"><strong>Answer</strong>: <span id=\"task-answer\"></span></div>
|
| 308 |
+
</section>
|
| 309 |
+
</div>
|
| 310 |
+
|
| 311 |
+
<div class=\"layout\">
|
| 312 |
+
<section class=\"card control-col\">
|
| 313 |
+
<div>
|
| 314 |
+
<h2>Graph Controls</h2>
|
| 315 |
+
<div class=\"control-grid\">
|
| 316 |
+
<label class=\"mono\" for=\"graph-mode\">Graph Layer</label>
|
| 317 |
+
<select id=\"graph-mode\">
|
| 318 |
+
<option value=\"canonical\">Canonical Graph</option>
|
| 319 |
+
<option value=\"episode\">Episode Graph</option>
|
| 320 |
+
</select>
|
| 321 |
+
<label class=\"mono\" for=\"graph-search\">Node Search</label>
|
| 322 |
+
<input id=\"graph-search\" type=\"search\" placeholder=\"Type node id or label...\" />
|
| 323 |
+
<label class=\"mono\" for=\"relation-filter\">Relation Filter</label>
|
| 324 |
+
<input id=\"relation-filter\" type=\"search\" placeholder=\"Filter edge labels...\" />
|
| 325 |
+
<button id=\"fit-graph\" class=\"primary\">Fit Graph</button>
|
| 326 |
+
</div>
|
| 327 |
+
</div>
|
| 328 |
+
<div>
|
| 329 |
+
<h2>Node Types</h2>
|
| 330 |
+
<div id=\"type-filters\" class=\"control-grid mono\"></div>
|
| 331 |
+
</div>
|
| 332 |
+
</section>
|
| 333 |
+
|
| 334 |
+
<section class=\"card\">
|
| 335 |
+
<h2>Graph Explorer</h2>
|
| 336 |
+
<div class=\"graph-wrap\">
|
| 337 |
+
<div class=\"graph-banner\" id=\"graph-banner\">Layer: Canonical Graph</div>
|
| 338 |
+
<div id=\"graph-canvas\" class=\"graph\"></div>
|
| 339 |
+
</div>
|
| 340 |
+
<div class=\"legend\">
|
| 341 |
+
<span><span class=\"dot\" style=\"background:#16a34a\"></span>matched edge</span>
|
| 342 |
+
<span><span class=\"dot\" style=\"background:#2563eb\"></span>predicted only</span>
|
| 343 |
+
<span><span class=\"dot\" style=\"background:#f59e0b\"></span>truth only</span>
|
| 344 |
+
</div>
|
| 345 |
+
</section>
|
| 346 |
+
|
| 347 |
+
<section class=\"card control-col\">
|
| 348 |
+
<div>
|
| 349 |
+
<h2>Node Inspector</h2>
|
| 350 |
+
<pre id=\"node-detail\" class=\"json-view\">Click a node to inspect attributes and neighbors.</pre>
|
| 351 |
+
</div>
|
| 352 |
+
<div>
|
| 353 |
+
<h2>Edge Inspector</h2>
|
| 354 |
+
<pre id=\"edge-detail\" class=\"json-view\">Click an edge to inspect relation details.</pre>
|
| 355 |
+
</div>
|
| 356 |
+
</section>
|
| 357 |
+
</div>
|
| 358 |
+
|
| 359 |
+
<div class=\"split\">
|
| 360 |
+
<section class=\"card\">
|
| 361 |
+
<h2>Original Database Explorer</h2>
|
| 362 |
+
<div class=\"db-tabs\" id=\"db-tabs\"></div>
|
| 363 |
+
<div class=\"inline\" style=\"margin-bottom:8px\">
|
| 364 |
+
<input id=\"db-search\" type=\"search\" placeholder=\"Search records...\" style=\"flex:1\" />
|
| 365 |
+
<select id=\"db-limit\">
|
| 366 |
+
<option value=\"200\">200</option>
|
| 367 |
+
<option value=\"500\">500</option>
|
| 368 |
+
<option value=\"1000\">1000</option>
|
| 369 |
+
</select>
|
| 370 |
+
</div>
|
| 371 |
+
<div class=\"table-wrap\"><table id=\"db-table\"></table></div>
|
| 372 |
+
</section>
|
| 373 |
+
|
| 374 |
+
<section class=\"card\">
|
| 375 |
+
<h2>Selected Source Record</h2>
|
| 376 |
+
<pre id=\"db-detail\" class=\"json-view\">Click a row in the database table to inspect full JSON.</pre>
|
| 377 |
+
</section>
|
| 378 |
+
</div>
|
| 379 |
+
|
| 380 |
+
<div class=\"charts\">
|
| 381 |
+
<section class=\"card\">
|
| 382 |
+
<h2>Benchmark Summary Radar</h2>
|
| 383 |
+
<div class=\"chart-box\"><canvas id=\"summary-chart\"></canvas></div>
|
| 384 |
+
</section>
|
| 385 |
+
<section class=\"card\">
|
| 386 |
+
<h2>Episode Reward and Graph F1</h2>
|
| 387 |
+
<div class=\"chart-box\"><canvas id=\"trace-chart\"></canvas></div>
|
| 388 |
+
</section>
|
| 389 |
+
</div>
|
| 390 |
+
|
| 391 |
+
<section class=\"card\">
|
| 392 |
+
<h2>Benchmark Leaderboard</h2>
|
| 393 |
+
<div class=\"inline\" style=\"margin-bottom:8px\">
|
| 394 |
+
<label class=\"mono\" for=\"leader-sort\">Sort by</label>
|
| 395 |
+
<select id=\"leader-sort\" class=\"subtle\">
|
| 396 |
+
<option value=\"leaderboard_score\">leaderboard_score</option>
|
| 397 |
+
<option value=\"task_success_rate\">task_success_rate</option>
|
| 398 |
+
<option value=\"avg_graph_f1\">avg_graph_f1</option>
|
| 399 |
+
<option value=\"retrieval_signal\">retrieval_signal</option>
|
| 400 |
+
<option value=\"structural_signal\">structural_signal</option>
|
| 401 |
+
<option value=\"spawn_signal\">spawn_signal</option>
|
| 402 |
+
<option value=\"avg_reward\">avg_reward</option>
|
| 403 |
+
</select>
|
| 404 |
+
</div>
|
| 405 |
+
<div class=\"table-wrap\"><table id=\"leaderboard-table\"></table></div>
|
| 406 |
+
</section>
|
| 407 |
+
</div>
|
| 408 |
+
|
| 409 |
+
<script>
|
| 410 |
+
const payload = {json.dumps(payload)};
|
| 411 |
+
|
| 412 |
+
function metricCards(summary) {{
|
| 413 |
+
const selected = [
|
| 414 |
+
["leaderboard_score", summary.leaderboard_score || 0],
|
| 415 |
+
["task_success_rate", summary.task_success_rate || 0],
|
| 416 |
+
["avg_graph_f1", summary.avg_graph_f1 || 0],
|
| 417 |
+
["retrieval_signal", summary.retrieval_signal || 0],
|
| 418 |
+
["structural_signal", summary.structural_signal || 0],
|
| 419 |
+
["tool_efficiency", summary.tool_efficiency || 0],
|
| 420 |
+
["avg_reward", summary.avg_reward || 0]
|
| 421 |
+
];
|
| 422 |
+
const root = document.getElementById("stats");
|
| 423 |
+
root.innerHTML = "";
|
| 424 |
+
selected.forEach(([k, v]) => {{
|
| 425 |
+
const div = document.createElement("div");
|
| 426 |
+
div.className = "stat";
|
| 427 |
+
div.innerHTML = `<div class=\"k\">${{k}}</div><div class=\"v\">${{Number(v).toFixed(3)}}</div>`;
|
| 428 |
+
root.appendChild(div);
|
| 429 |
+
}});
|
| 430 |
+
|
| 431 |
+
const pillRow = document.getElementById("hero-pills");
|
| 432 |
+
pillRow.innerHTML = "";
|
| 433 |
+
[
|
| 434 |
+
`deanonymization: ${{Number(summary.deanonymization_accuracy || 0).toFixed(3)}}`,
|
| 435 |
+
`avg steps: ${{Number(summary.avg_steps_to_solution || 0).toFixed(2)}}`,
|
| 436 |
+
`episodes: ${{(payload.episodes || []).length}}`
|
| 437 |
+
].forEach((text) => {{
|
| 438 |
+
const span = document.createElement("span");
|
| 439 |
+
span.className = "pill";
|
| 440 |
+
span.textContent = text;
|
| 441 |
+
pillRow.appendChild(span);
|
| 442 |
+
}});
|
| 443 |
+
}}
|
| 444 |
+
|
| 445 |
+
function buildTypeFilters(allGroups) {{
|
| 446 |
+
const root = document.getElementById("type-filters");
|
| 447 |
+
root.innerHTML = "";
|
| 448 |
+
allGroups.forEach((group) => {{
|
| 449 |
+
const id = `type_${{group}}`;
|
| 450 |
+
const row = document.createElement("label");
|
| 451 |
+
row.className = "inline";
|
| 452 |
+
row.innerHTML = `<input type=\"checkbox\" id=\"${{id}}\" value=\"${{group}}\" checked /> <span>${{group}}</span>`;
|
| 453 |
+
root.appendChild(row);
|
| 454 |
+
}});
|
| 455 |
+
}}
|
| 456 |
+
|
| 457 |
+
function createNetworkController() {{
|
| 458 |
+
const container = document.getElementById("graph-canvas");
|
| 459 |
+
const banner = document.getElementById("graph-banner");
|
| 460 |
+
const modeSelect = document.getElementById("graph-mode");
|
| 461 |
+
const nodeSearch = document.getElementById("graph-search");
|
| 462 |
+
const relFilter = document.getElementById("relation-filter");
|
| 463 |
+
const fitBtn = document.getElementById("fit-graph");
|
| 464 |
+
|
| 465 |
+
const rawLayers = {{
|
| 466 |
+
canonical: payload.canonical_graph || {{ nodes: [], edges: [] }},
|
| 467 |
+
episode: payload.episode_graph || {{ nodes: [], edges: [] }}
|
| 468 |
+
}};
|
| 469 |
+
|
| 470 |
+
const allGroups = Array.from(new Set((rawLayers.canonical.nodes || []).map(n => n.group || "unknown"))).sort();
|
| 471 |
+
buildTypeFilters(allGroups);
|
| 472 |
+
|
| 473 |
+
const state = {{
|
| 474 |
+
mode: "canonical",
|
| 475 |
+
relationQuery: "",
|
| 476 |
+
nodeQuery: "",
|
| 477 |
+
}};
|
| 478 |
+
|
| 479 |
+
const nodesDS = new vis.DataSet([]);
|
| 480 |
+
const edgesDS = new vis.DataSet([]);
|
| 481 |
+
const network = new vis.Network(container, {{ nodes: nodesDS, edges: edgesDS }}, {{
|
| 482 |
+
interaction: {{ hover: true, navigationButtons: true, keyboard: true }},
|
| 483 |
+
physics: {{ stabilization: false, barnesHut: {{ springLength: 130 }} }},
|
| 484 |
+
edges: {{ smooth: true, font: {{ size: 10 }} }},
|
| 485 |
+
nodes: {{ shape: "dot", size: 11, font: {{ size: 10 }} }}
|
| 486 |
+
}});
|
| 487 |
+
|
| 488 |
+
function activeGroups() {{
|
| 489 |
+
const checked = Array.from(document.querySelectorAll('#type-filters input[type="checkbox"]:checked'));
|
| 490 |
+
return new Set(checked.map(x => x.value));
|
| 491 |
+
}}
|
| 492 |
+
|
| 493 |
+
function styleNode(node, query) {{
|
| 494 |
+
const text = `${{node.id}} ${{node.label || ""}}`.toLowerCase();
|
| 495 |
+
const hit = query && text.includes(query);
|
| 496 |
+
return {{
|
| 497 |
+
...node,
|
| 498 |
+
color: hit ? "#f59e0b" : undefined,
|
| 499 |
+
size: hit ? 18 : 11,
|
| 500 |
+
}};
|
| 501 |
+
}}
|
| 502 |
+
|
| 503 |
+
function refresh() {{
|
| 504 |
+
const raw = rawLayers[state.mode] || {{ nodes: [], edges: [] }};
|
| 505 |
+
const groups = activeGroups();
|
| 506 |
+
const relQ = state.relationQuery.toLowerCase();
|
| 507 |
+
const nodeQ = state.nodeQuery.toLowerCase();
|
| 508 |
+
|
| 509 |
+
const nodes = (raw.nodes || []).filter(n => groups.has(n.group || "unknown")).map(n => styleNode(n, nodeQ));
|
| 510 |
+
const nodeIds = new Set(nodes.map(n => n.id));
|
| 511 |
+
const edges = (raw.edges || []).filter(e => nodeIds.has(e.from) && nodeIds.has(e.to)).filter(e => !relQ || String(e.label || "").toLowerCase().includes(relQ));
|
| 512 |
+
|
| 513 |
+
nodesDS.clear();
|
| 514 |
+
edgesDS.clear();
|
| 515 |
+
nodesDS.add(nodes);
|
| 516 |
+
edgesDS.add(edges);
|
| 517 |
+
|
| 518 |
+
banner.textContent = state.mode === "canonical" ? "Layer: Canonical Graph" : "Layer: Episode Graph";
|
| 519 |
+
}}
|
| 520 |
+
|
| 521 |
+
modeSelect.addEventListener("change", () => {{
|
| 522 |
+
state.mode = modeSelect.value;
|
| 523 |
+
refresh();
|
| 524 |
+
}});
|
| 525 |
+
relFilter.addEventListener("input", () => {{
|
| 526 |
+
state.relationQuery = relFilter.value || "";
|
| 527 |
+
refresh();
|
| 528 |
+
}});
|
| 529 |
+
nodeSearch.addEventListener("input", () => {{
|
| 530 |
+
state.nodeQuery = nodeSearch.value || "";
|
| 531 |
+
refresh();
|
| 532 |
+
}});
|
| 533 |
+
fitBtn.addEventListener("click", () => network.fit({{ animation: true }}));
|
| 534 |
+
document.getElementById("type-filters").addEventListener("change", refresh);
|
| 535 |
+
|
| 536 |
+
network.on("click", (params) => {{
|
| 537 |
+
if (params.nodes && params.nodes.length) {{
|
| 538 |
+
const node = nodesDS.get(params.nodes[0]);
|
| 539 |
+
const connected = network.getConnectedNodes(node.id) || [];
|
| 540 |
+
document.getElementById("node-detail").textContent = JSON.stringify({{
|
| 541 |
+
node,
|
| 542 |
+
connected_nodes: connected
|
| 543 |
+
}}, null, 2);
|
| 544 |
+
}}
|
| 545 |
+
if (params.edges && params.edges.length) {{
|
| 546 |
+
const edge = edgesDS.get(params.edges[0]);
|
| 547 |
+
document.getElementById("edge-detail").textContent = JSON.stringify(edge, null, 2);
|
| 548 |
+
}}
|
| 549 |
+
}});
|
| 550 |
+
|
| 551 |
+
refresh();
|
| 552 |
+
}}
|
| 553 |
+
|
| 554 |
+
function buildRows(views) {{
|
| 555 |
+
const rows = [];
|
| 556 |
+
(views.microblog_posts || []).forEach((x) => rows.push({{ source: "microblog", id: x.post_id || "post", text: JSON.stringify(x), raw: x }}));
|
| 557 |
+
(views.forum_threads || []).forEach((x) => rows.push({{ source: "forum", id: x.thread_id || "thread", text: JSON.stringify(x), raw: x }}));
|
| 558 |
+
(views.profiles || []).forEach((x) => rows.push({{ source: "profile", id: x.user_id || "profile", text: JSON.stringify(x), raw: x }}));
|
| 559 |
+
return rows;
|
| 560 |
+
}}
|
| 561 |
+
|
| 562 |
+
function initDatabaseExplorer() {{
|
| 563 |
+
const rows = buildRows(payload.views || {{}});
|
| 564 |
+
const tabs = document.getElementById("db-tabs");
|
| 565 |
+
const search = document.getElementById("db-search");
|
| 566 |
+
const limit = document.getElementById("db-limit");
|
| 567 |
+
const table = document.getElementById("db-table");
|
| 568 |
+
const detail = document.getElementById("db-detail");
|
| 569 |
+
|
| 570 |
+
const sources = ["all", "microblog", "forum", "profile"];
|
| 571 |
+
const state = {{ source: "all", query: "", limit: 200 }};
|
| 572 |
+
|
| 573 |
+
tabs.innerHTML = "";
|
| 574 |
+
sources.forEach((src) => {{
|
| 575 |
+
const btn = document.createElement("button");
|
| 576 |
+
btn.className = `tab ${{src === state.source ? "active" : ""}}`;
|
| 577 |
+
btn.textContent = src;
|
| 578 |
+
btn.addEventListener("click", () => {{
|
| 579 |
+
state.source = src;
|
| 580 |
+
Array.from(tabs.children).forEach((child) => child.classList.remove("active"));
|
| 581 |
+
btn.classList.add("active");
|
| 582 |
+
render();
|
| 583 |
+
}});
|
| 584 |
+
tabs.appendChild(btn);
|
| 585 |
+
}});
|
| 586 |
+
|
| 587 |
+
function filtered() {{
|
| 588 |
+
const q = state.query.toLowerCase();
|
| 589 |
+
return rows
|
| 590 |
+
.filter((row) => state.source === "all" || row.source === state.source)
|
| 591 |
+
.filter((row) => !q || row.text.toLowerCase().includes(q) || row.id.toLowerCase().includes(q));
|
| 592 |
+
}}
|
| 593 |
+
|
| 594 |
+
function render() {{
|
| 595 |
+
const show = filtered().slice(0, state.limit);
|
| 596 |
+
table.innerHTML = "<thead><tr><th>source</th><th>id</th><th>preview</th></tr></thead>";
|
| 597 |
+
const body = document.createElement("tbody");
|
| 598 |
+
show.forEach((row) => {{
|
| 599 |
+
const tr = document.createElement("tr");
|
| 600 |
+
const preview = row.text.length > 120 ? `${{row.text.slice(0, 120)}}...` : row.text;
|
| 601 |
+
tr.innerHTML = `<td>${{row.source}}</td><td class=\"mono\">${{row.id}}</td><td>${{preview}}</td>`;
|
| 602 |
+
tr.addEventListener("click", () => {{
|
| 603 |
+
detail.textContent = JSON.stringify(row.raw, null, 2);
|
| 604 |
+
}});
|
| 605 |
+
body.appendChild(tr);
|
| 606 |
+
}});
|
| 607 |
+
table.appendChild(body);
|
| 608 |
+
}}
|
| 609 |
+
|
| 610 |
+
search.addEventListener("input", () => {{ state.query = search.value || ""; render(); }});
|
| 611 |
+
limit.addEventListener("change", () => {{ state.limit = Number(limit.value || 200); render(); }});
|
| 612 |
+
render();
|
| 613 |
+
}}
|
| 614 |
+
|
| 615 |
+
function renderLeaderboard(records, sortBy = "leaderboard_score") {{
|
| 616 |
+
const sorted = [...records].sort((a, b) => (b.metrics?.[sortBy] || 0) - (a.metrics?.[sortBy] || 0));
|
| 617 |
+
const table = document.getElementById("leaderboard-table");
|
| 618 |
+
table.innerHTML = "<thead><tr><th>rank</th><th>run</th><th>score</th><th>success</th><th>graph_f1</th><th>retrieval</th><th>structural</th><th>spawn</th><th>reward</th></tr></thead>";
|
| 619 |
+
const body = document.createElement("tbody");
|
| 620 |
+
sorted.forEach((rec, i) => {{
|
| 621 |
+
const m = rec.metrics || {{}};
|
| 622 |
+
const tr = document.createElement("tr");
|
| 623 |
+
tr.innerHTML = `<td>${{i + 1}}</td><td>${{rec.run_name || rec.run_id || "run"}}</td><td>${{(m.leaderboard_score || 0).toFixed(4)}}</td><td>${{(m.task_success_rate || 0).toFixed(3)}}</td><td>${{(m.avg_graph_f1 || 0).toFixed(3)}}</td><td>${{(m.retrieval_signal || 0).toFixed(3)}}</td><td>${{(m.structural_signal || 0).toFixed(3)}}</td><td>${{(m.spawn_signal || 0).toFixed(3)}}</td><td>${{(m.avg_reward || 0).toFixed(3)}}</td>`;
|
| 624 |
+
body.appendChild(tr);
|
| 625 |
+
}});
|
| 626 |
+
table.appendChild(body);
|
| 627 |
+
}}
|
| 628 |
+
|
| 629 |
+
function drawSummaryChart(summary) {{
|
| 630 |
+
const labels = ["success", "graph_f1", "tool_eff", "deanon", "retrieval", "structural", "score"];
|
| 631 |
+
const values = [
|
| 632 |
+
summary.task_success_rate || 0,
|
| 633 |
+
summary.avg_graph_f1 || 0,
|
| 634 |
+
summary.tool_efficiency || 0,
|
| 635 |
+
summary.deanonymization_accuracy || 0,
|
| 636 |
+
summary.retrieval_signal || 0,
|
| 637 |
+
summary.structural_signal || 0,
|
| 638 |
+
summary.leaderboard_score || 0,
|
| 639 |
+
];
|
| 640 |
+
new Chart(document.getElementById("summary-chart"), {{
|
| 641 |
+
type: "radar",
|
| 642 |
+
data: {{
|
| 643 |
+
labels,
|
| 644 |
+
datasets: [{{
|
| 645 |
+
label: "normalized metrics",
|
| 646 |
+
data: values,
|
| 647 |
+
backgroundColor: "rgba(15,118,110,0.2)",
|
| 648 |
+
borderColor: "#0f766e",
|
| 649 |
+
pointBackgroundColor: "#d97706",
|
| 650 |
+
pointRadius: 3
|
| 651 |
+
}}]
|
| 652 |
+
}},
|
| 653 |
+
options: {{ responsive: true, maintainAspectRatio: false, scales: {{ r: {{ min: 0, max: 1 }} }} }}
|
| 654 |
+
}});
|
| 655 |
+
}}
|
| 656 |
+
|
| 657 |
+
function drawTraceChart(episodes) {{
|
| 658 |
+
const labels = episodes.map((_, i) => `ep_${{i + 1}}`);
|
| 659 |
+
const rewards = episodes.map(e => e.reward || 0);
|
| 660 |
+
const f1 = episodes.map(e => e.graph_f1 || 0);
|
| 661 |
+
new Chart(document.getElementById("trace-chart"), {{
|
| 662 |
+
type: "line",
|
| 663 |
+
data: {{
|
| 664 |
+
labels,
|
| 665 |
+
datasets: [
|
| 666 |
+
{{ label: "reward", data: rewards, borderColor: "#0f766e", yAxisID: "y", tension: 0.2 }},
|
| 667 |
+
{{ label: "graph_f1", data: f1, borderColor: "#d97706", yAxisID: "y1", tension: 0.2 }}
|
| 668 |
+
]
|
| 669 |
+
}},
|
| 670 |
+
options: {{
|
| 671 |
+
responsive: true,
|
| 672 |
+
maintainAspectRatio: false,
|
| 673 |
+
scales: {{
|
| 674 |
+
y: {{ position: "left" }},
|
| 675 |
+
y1: {{ position: "right", min: 0, max: 1, grid: {{ drawOnChartArea: false }} }}
|
| 676 |
+
}}
|
| 677 |
+
}}
|
| 678 |
+
}});
|
| 679 |
+
}}
|
| 680 |
+
|
| 681 |
+
const summary = payload.summary || {{}};
|
| 682 |
+
metricCards(summary);
|
| 683 |
+
|
| 684 |
+
document.getElementById("task-id").textContent = payload.task.task_id;
|
| 685 |
+
document.getElementById("task-type").textContent = payload.task.task_type;
|
| 686 |
+
document.getElementById("task-question").textContent = payload.task.question;
|
| 687 |
+
document.getElementById("task-answer").textContent = payload.task.answer;
|
| 688 |
+
|
| 689 |
+
createNetworkController();
|
| 690 |
+
initDatabaseExplorer();
|
| 691 |
+
|
| 692 |
+
const leaderboard = payload.leaderboard || [];
|
| 693 |
+
const leaderSort = document.getElementById("leader-sort");
|
| 694 |
+
renderLeaderboard(leaderboard, leaderSort.value);
|
| 695 |
+
leaderSort.addEventListener("change", () => renderLeaderboard(leaderboard, leaderSort.value));
|
| 696 |
+
|
| 697 |
+
drawSummaryChart(summary);
|
| 698 |
+
drawTraceChart(payload.episodes || []);
|
| 699 |
+
</script>
|
| 700 |
+
</body>
|
| 701 |
+
</html>
|
| 702 |
+
"""
|
| 703 |
+
|
| 704 |
+
out = Path(output_path)
|
| 705 |
+
out.parent.mkdir(parents=True, exist_ok=True)
|
| 706 |
+
out.write_text(html, encoding="utf-8")
|
| 707 |
+
return str(out)
|
tests/test_config.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from osint_env.config.shared import load_seeding_config, load_shared_config
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_shared_config_defaults_when_file_missing():
|
| 8 |
+
config = load_shared_config("/tmp/does_not_exist_for_osint_config.json")
|
| 9 |
+
assert config.environment.max_steps > 0
|
| 10 |
+
assert config.runtime.default_episodes > 0
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_shared_config_parses_swarm_and_seeding(tmp_path: Path):
|
| 14 |
+
path = tmp_path / "shared.json"
|
| 15 |
+
path.write_text(
|
| 16 |
+
json.dumps(
|
| 17 |
+
{
|
| 18 |
+
"environment": {"seed": 19, "max_steps": 9},
|
| 19 |
+
"swarm": {"enabled": True, "max_agents": 3, "max_breadth": 2, "max_width": 2, "max_depth": 2},
|
| 20 |
+
"seeding": {
|
| 21 |
+
"seeded_questions": [
|
| 22 |
+
{
|
| 23 |
+
"question": "Which canonical user owns alias alias_seed_001?",
|
| 24 |
+
"answer": "user_seed_001",
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
"runtime": {"default_episodes": 5},
|
| 29 |
+
}
|
| 30 |
+
),
|
| 31 |
+
encoding="utf-8",
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
config = load_shared_config(path)
|
| 35 |
+
assert config.environment.seed == 19
|
| 36 |
+
assert config.environment.swarm.enabled is True
|
| 37 |
+
assert config.environment.swarm.max_width == 2
|
| 38 |
+
assert len(config.environment.seeding.seeded_questions) == 1
|
| 39 |
+
assert config.runtime.default_episodes == 5
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_load_seeding_config_supports_top_level_object(tmp_path: Path):
|
| 43 |
+
path = tmp_path / "seeding.json"
|
| 44 |
+
path.write_text(
|
| 45 |
+
json.dumps(
|
| 46 |
+
{
|
| 47 |
+
"seeded_nodes": [
|
| 48 |
+
{"node_id": "alias_seed_1", "node_type": "alias", "attrs": {"handle": "@seed"}},
|
| 49 |
+
{"node_id": "user_seed_1", "node_type": "user", "attrs": {"name": "Seed"}},
|
| 50 |
+
],
|
| 51 |
+
"seeded_edges": [{"src": "alias_seed_1", "rel": "alias_of", "dst": "user_seed_1"}],
|
| 52 |
+
"seeded_questions": [{"question": "Which canonical user owns alias alias_seed_1?", "answer": "user_seed_1"}],
|
| 53 |
+
}
|
| 54 |
+
),
|
| 55 |
+
encoding="utf-8",
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
seeding = load_seeding_config(path)
|
| 59 |
+
assert len(seeding.seeded_nodes) == 2
|
| 60 |
+
assert len(seeding.seeded_edges) == 1
|
| 61 |
+
assert seeding.seeded_questions[0].answer == "user_seed_1"
|
tests/test_dashboard.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from osint_env.domain.models import EnvironmentConfig
|
| 4 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 5 |
+
from osint_env.viz import export_dashboard
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_dashboard_export(tmp_path: Path):
|
| 9 |
+
env = OSINTEnvironment(EnvironmentConfig(seed=9, n_users=14))
|
| 10 |
+
env.reset()
|
| 11 |
+
|
| 12 |
+
out = tmp_path / "dashboard.html"
|
| 13 |
+
path = export_dashboard(
|
| 14 |
+
env=env,
|
| 15 |
+
evaluation={"summary": {"leaderboard_score": 0.0, "task_success_rate": 0.0, "avg_graph_f1": 0.0, "tool_efficiency": 0.0, "deanonymization_accuracy": 0.0, "avg_reward": 0.0}, "episodes": []},
|
| 16 |
+
leaderboard_records=[],
|
| 17 |
+
output_path=str(out),
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
assert path.endswith("dashboard.html")
|
| 21 |
+
text = out.read_text(encoding="utf-8")
|
| 22 |
+
assert "OSINT Benchmark Dashboard" in text
|
| 23 |
+
assert "Canonical Graph" in text
|
| 24 |
+
assert "Original Database Explorer" in text
|
| 25 |
+
assert "Benchmark Leaderboard" in text
|
tests/test_eval.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from osint_env.domain.models import EnvironmentConfig
|
| 2 |
from osint_env.env.environment import OSINTEnvironment
|
| 3 |
from osint_env.eval.runner import run_evaluation
|
| 4 |
|
|
@@ -8,3 +8,14 @@ def test_eval_runner():
|
|
| 8 |
result = run_evaluation(env, episodes=3)
|
| 9 |
assert "task_success_rate" in result
|
| 10 |
assert "deanonymization_accuracy" in result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.domain.models import EnvironmentConfig, SwarmConfig
|
| 2 |
from osint_env.env.environment import OSINTEnvironment
|
| 3 |
from osint_env.eval.runner import run_evaluation
|
| 4 |
|
|
|
|
| 8 |
result = run_evaluation(env, episodes=3)
|
| 9 |
assert "task_success_rate" in result
|
| 10 |
assert "deanonymization_accuracy" in result
|
| 11 |
+
assert "leaderboard_score" in result
|
| 12 |
+
assert "avg_knowledge_indexing_reward" in result
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_eval_runner_swarm_mode():
|
| 16 |
+
env = OSINTEnvironment(
|
| 17 |
+
EnvironmentConfig(seed=17, swarm=SwarmConfig(enabled=True, max_agents=3, max_breadth=2, max_width=2, max_depth=2))
|
| 18 |
+
)
|
| 19 |
+
result = run_evaluation(env, episodes=2)
|
| 20 |
+
assert "spawn_signal" in result
|
| 21 |
+
assert "avg_spawn_count" in result
|
tests/test_leaderboard.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard, render_leaderboard_table, sorted_leaderboard
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_leaderboard_roundtrip(tmp_path: Path):
|
| 7 |
+
board = tmp_path / "leaderboard.json"
|
| 8 |
+
append_leaderboard_record(
|
| 9 |
+
path=board,
|
| 10 |
+
summary={
|
| 11 |
+
"leaderboard_score": 0.42,
|
| 12 |
+
"task_success_rate": 0.5,
|
| 13 |
+
"avg_graph_f1": 0.4,
|
| 14 |
+
"avg_reward": 0.1,
|
| 15 |
+
"tool_efficiency": 0.9,
|
| 16 |
+
"retrieval_signal": 0.3,
|
| 17 |
+
"structural_signal": 0.4,
|
| 18 |
+
},
|
| 19 |
+
episodes=5,
|
| 20 |
+
run_name="baseline",
|
| 21 |
+
)
|
| 22 |
+
append_leaderboard_record(
|
| 23 |
+
path=board,
|
| 24 |
+
summary={
|
| 25 |
+
"leaderboard_score": 0.75,
|
| 26 |
+
"task_success_rate": 0.7,
|
| 27 |
+
"avg_graph_f1": 0.6,
|
| 28 |
+
"avg_reward": 0.5,
|
| 29 |
+
"tool_efficiency": 0.8,
|
| 30 |
+
"retrieval_signal": 0.6,
|
| 31 |
+
"structural_signal": 0.7,
|
| 32 |
+
},
|
| 33 |
+
episodes=5,
|
| 34 |
+
run_name="improved",
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
records = load_leaderboard(board)
|
| 38 |
+
ranked = sorted_leaderboard(records)
|
| 39 |
+
assert len(records) == 2
|
| 40 |
+
assert ranked[0]["run_name"] == "improved"
|
| 41 |
+
|
| 42 |
+
ranked_by_success = sorted_leaderboard(records, sort_by="task_success_rate")
|
| 43 |
+
assert ranked_by_success[0]["run_name"] == "improved"
|
| 44 |
+
|
| 45 |
+
table = render_leaderboard_table(records, top_k=5)
|
| 46 |
+
assert "| rank | run |" in table
|
| 47 |
+
assert "retrieval" in table
|
tests/test_reward.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.domain.models import Edge, EnvironmentConfig
|
| 2 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 3 |
+
from osint_env.env.reward import build_reward_model, compute_answer_reward, compute_edge_reward
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_composite_edge_reward_returns_breakdown():
|
| 7 |
+
env = OSINTEnvironment(EnvironmentConfig(seed=13, n_users=16, max_steps=6))
|
| 8 |
+
obs = env.reset()
|
| 9 |
+
task = env.state.task
|
| 10 |
+
|
| 11 |
+
model = build_reward_model(env.graph)
|
| 12 |
+
edge = task.supporting_edges[0]
|
| 13 |
+
breakdown = compute_edge_reward(
|
| 14 |
+
edge=edge,
|
| 15 |
+
task=task,
|
| 16 |
+
existing_edges=[],
|
| 17 |
+
step_count=1,
|
| 18 |
+
model=model,
|
| 19 |
+
graph=env.graph,
|
| 20 |
+
)
|
| 21 |
+
assert isinstance(breakdown.total, float)
|
| 22 |
+
assert breakdown.global_accuracy > 0
|
| 23 |
+
assert isinstance(breakdown.connectivity_gain, float)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_answer_reward_uses_graph_and_tool_context():
|
| 27 |
+
env = OSINTEnvironment(EnvironmentConfig(seed=21, n_users=18, max_steps=6))
|
| 28 |
+
env.reset()
|
| 29 |
+
task = env.state.task
|
| 30 |
+
|
| 31 |
+
pred_edges = [Edge(task.supporting_edges[0].src, task.supporting_edges[0].rel, task.supporting_edges[0].dst)]
|
| 32 |
+
tool_outputs = [{"tool": "get_profile", "output": {"result": {"user_id": task.answer}}}]
|
| 33 |
+
|
| 34 |
+
good = compute_answer_reward(
|
| 35 |
+
proposed_answer=task.answer,
|
| 36 |
+
task=task,
|
| 37 |
+
pred_edges=pred_edges,
|
| 38 |
+
tool_outputs=tool_outputs,
|
| 39 |
+
step_count=2,
|
| 40 |
+
)
|
| 41 |
+
bad = compute_answer_reward(
|
| 42 |
+
proposed_answer="wrong",
|
| 43 |
+
task=task,
|
| 44 |
+
pred_edges=[],
|
| 45 |
+
tool_outputs=[],
|
| 46 |
+
step_count=2,
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
assert good.total > bad.total
|
| 50 |
+
assert good.graph_f1 >= 0
|
| 51 |
+
assert isinstance(good.relation_informativeness, float)
|
| 52 |
+
assert isinstance(good.entity_informativeness, float)
|
| 53 |
+
assert isinstance(good.repetition_penalty, float)
|
tests/test_seeding.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.domain.models import (
|
| 2 |
+
EnvironmentConfig,
|
| 3 |
+
NodeType,
|
| 4 |
+
SeedEdgeSpec,
|
| 5 |
+
SeedNodeSpec,
|
| 6 |
+
SeedQuestionSpec,
|
| 7 |
+
SeedingConfig,
|
| 8 |
+
)
|
| 9 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_environment_includes_seeded_graph_and_questions():
|
| 13 |
+
seeding = SeedingConfig(
|
| 14 |
+
seeded_nodes=[
|
| 15 |
+
SeedNodeSpec(node_id="alias_seed_001", node_type=NodeType.ALIAS, attrs={"handle": "@seed001"}),
|
| 16 |
+
SeedNodeSpec(
|
| 17 |
+
node_id="user_seed_001",
|
| 18 |
+
node_type=NodeType.USER,
|
| 19 |
+
attrs={"name": "Seed User", "org": "Helios Labs", "location": "Pune"},
|
| 20 |
+
),
|
| 21 |
+
],
|
| 22 |
+
seeded_edges=[SeedEdgeSpec(src="alias_seed_001", rel="alias_of", dst="user_seed_001")],
|
| 23 |
+
seeded_questions=[
|
| 24 |
+
SeedQuestionSpec(
|
| 25 |
+
question="Which canonical user owns alias alias_seed_001?",
|
| 26 |
+
answer="user_seed_001",
|
| 27 |
+
task_type="identity_resolution",
|
| 28 |
+
supporting_edges=[SeedEdgeSpec(src="alias_seed_001", rel="alias_of", dst="user_seed_001")],
|
| 29 |
+
)
|
| 30 |
+
],
|
| 31 |
+
llm_generate_remaining_graph=False,
|
| 32 |
+
llm_generate_remaining_tasks=False,
|
| 33 |
+
llm_generated_edge_budget=0,
|
| 34 |
+
llm_generated_task_budget=0,
|
| 35 |
+
)
|
| 36 |
+
env = OSINTEnvironment(EnvironmentConfig(seed=33, n_users=12, seeding=seeding))
|
| 37 |
+
|
| 38 |
+
assert "alias_seed_001" in env.graph.nodes
|
| 39 |
+
assert any(edge.src == "alias_seed_001" and edge.rel == "alias_of" and edge.dst == "user_seed_001" for edge in env.graph.edges)
|
| 40 |
+
assert any("alias_seed_001" in task.question for task in env.tasks)
|
tests/test_spawn_reward_hooks.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.env.spawn_reward_hooks import critical_steps, parl_style_spawn_reward
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_critical_steps_matches_parallel_path_length():
|
| 5 |
+
total = critical_steps(main_steps=[1, 1, 1], parallel_subagent_steps=[[3, 2], [0], [4, 1, 2]])
|
| 6 |
+
assert total == 1 + 3 + 1 + 0 + 1 + 4
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_parl_reward_prefers_finished_parallel_work():
|
| 10 |
+
base = parl_style_spawn_reward(
|
| 11 |
+
task_outcome_reward=0.2,
|
| 12 |
+
spawn_count=4,
|
| 13 |
+
finished_subtasks=1,
|
| 14 |
+
critical_steps=12,
|
| 15 |
+
lambda_parallel=0.2,
|
| 16 |
+
lambda_finish=0.25,
|
| 17 |
+
anneal=1.0,
|
| 18 |
+
breadth=2,
|
| 19 |
+
depth=3,
|
| 20 |
+
)
|
| 21 |
+
better = parl_style_spawn_reward(
|
| 22 |
+
task_outcome_reward=0.2,
|
| 23 |
+
spawn_count=4,
|
| 24 |
+
finished_subtasks=4,
|
| 25 |
+
critical_steps=8,
|
| 26 |
+
lambda_parallel=0.2,
|
| 27 |
+
lambda_finish=0.25,
|
| 28 |
+
anneal=1.0,
|
| 29 |
+
breadth=4,
|
| 30 |
+
depth=2,
|
| 31 |
+
)
|
| 32 |
+
assert better > base
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_parl_auxiliary_can_be_annealed_out():
|
| 36 |
+
frozen = parl_style_spawn_reward(
|
| 37 |
+
task_outcome_reward=0.7,
|
| 38 |
+
spawn_count=8,
|
| 39 |
+
finished_subtasks=8,
|
| 40 |
+
critical_steps=5,
|
| 41 |
+
anneal=0.0,
|
| 42 |
+
)
|
| 43 |
+
assert frozen == 0.7
|
tests/test_swarm_agent.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 2 |
+
from osint_env.domain.models import EnvironmentConfig, SwarmConfig
|
| 3 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_swarm_runner_emits_spawn_telemetry():
|
| 7 |
+
config = EnvironmentConfig(
|
| 8 |
+
seed=14,
|
| 9 |
+
max_steps=8,
|
| 10 |
+
swarm=SwarmConfig(enabled=True, max_agents=3, max_breadth=2, max_width=2, max_depth=2, planner_rounds=2),
|
| 11 |
+
)
|
| 12 |
+
env = OSINTEnvironment(config)
|
| 13 |
+
info = SwarmAgentRunner(env).run_episode()
|
| 14 |
+
|
| 15 |
+
assert info["spawn_count"] > 0
|
| 16 |
+
assert "spawn_auxiliary" in info["reward_components"]
|
| 17 |
+
assert info["spawn_critical_steps"] > 0
|