siddeshwar-kagatikar committed
Commit · db4fa53 · 0 Parent(s)
Deploy clean snapshot to Hugging Face Space.
Publish RL output-validity prompts, retry-backed task materialization, and graded reward shaping via a clean-history force deploy required by HF push limits.
Made-with: Cursor
This view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +9 -0
- .gitattributes +35 -0
- .github/workflows/validation.yml +32 -0
- .gitignore +11 -0
- .tmp_compare/Meta-s-LedgerShield +1 -0
- Dockerfile +28 -0
- README.md +392 -0
- artifacts/leaderboard.json +453 -0
- config/seed_example.json +52 -0
- config/seed_ollama_smoke.json +51 -0
- config/self_play_training_example.json +105 -0
- config/self_play_training_hf_a10g_smoke.json +72 -0
- config/shared_config.json +63 -0
- config/shared_config_metaqa.json +63 -0
- datasets/fixed_levels/README.md +64 -0
- datasets/fixed_levels/complete_dataset_qwen_generated.json +0 -0
- datasets/fixed_levels/fixed_graph_questions.json +0 -0
- datasets/fixed_levels/leaderboard_fixed_levels.json +1401 -0
- datasets/fixed_levels/qwen_swarm_benchmark_fixed_levels.json +69 -0
- datasets/fixed_levels/qwen_swarm_eval_by_difficulty.json +53 -0
- datasets/fixed_levels/qwen_swarm_eval_fixed_levels.json +25 -0
- datasets/fixed_levels/seed_fixed_levels.json +0 -0
- datasets/fixed_levels/shared_config_fixed_levels.json +63 -0
- docs/adversarial_self_play.md +99 -0
- docs/reward_design_notes.md +94 -0
- inference.py +540 -0
- my_env_v4.py +46 -0
- openenv.yaml +66 -0
- pyproject.toml +45 -0
- requirements.txt +11 -0
- scripts/build_fixed_levels_dataset.py +197 -0
- scripts/generate_fixed_levels_seed.py +109 -0
- scripts/run_openai_baseline.py +59 -0
- scripts/space_start.sh +35 -0
- scripts/test_ollama_space.py +185 -0
- scripts/validate_release.py +21 -0
- server.py +564 -0
- server/app.py +26 -0
- src/osint_env/__init__.py +5 -0
- src/osint_env/agents/__init__.py +7 -0
- src/osint_env/agents/single_agent.py +41 -0
- src/osint_env/agents/swarm_agent.py +209 -0
- src/osint_env/api/__init__.py +19 -0
- src/osint_env/api/models.py +73 -0
- src/osint_env/baselines/__init__.py +4 -0
- src/osint_env/baselines/openai_runner.py +533 -0
- src/osint_env/cli.py +440 -0
- src/osint_env/config/__init__.py +9 -0
- src/osint_env/config/shared.py +279 -0
- src/osint_env/data/__init__.py +2 -0
.dockerignore
ADDED
@@ -0,0 +1,9 @@
.git
.pytest_cache
__pycache__
*.pyc
*.pyo
*.pyd
.Python
artifacts
tests
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/validation.yml
ADDED
@@ -0,0 +1,32 @@
name: Validation

on:
  push:
  pull_request:
  workflow_dispatch:

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install project
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e .[dev]

      - name: Run test suite
        run: python -m pytest -q

      - name: Run validation gate
        run: python scripts/validate_release.py

      - name: Build Docker image
        run: docker build -t osint-openenv-validation .
.gitignore
ADDED
@@ -0,0 +1,11 @@
*.pyc
blueprint.txt
*.egg-info
artifacts/*
*.html
osint_dashboard.html
.venv/
.tmp_compare/
metaQA/
.codex
TODO.txt
.tmp_compare/Meta-s-LedgerShield
ADDED
@@ -0,0 +1 @@
Subproject commit fd5c9b60ddfbd2eba9d09001938b63169ac98f7b
Dockerfile
ADDED
@@ -0,0 +1,28 @@
FROM python:3.12-slim

RUN useradd -m -u 1000 user

USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860

WORKDIR $HOME/app

COPY --chown=user pyproject.toml README.md openenv.yaml inference.py $HOME/app/
COPY --chown=user src $HOME/app/src
COPY --chown=user config $HOME/app/config
COPY --chown=user datasets $HOME/app/datasets
COPY --chown=user docs $HOME/app/docs
COPY --chown=user scripts $HOME/app/scripts
COPY --chown=user server.py $HOME/app/server.py

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -e ".[train]" && \
    chmod +x $HOME/app/scripts/space_start.sh

EXPOSE 7860

CMD ["sh", "/home/user/app/scripts/space_start.sh"]
README.md
ADDED
@@ -0,0 +1,392 @@
---
title: OSINT OpenEnv
emoji: 🕵️
colorFrom: blue
colorTo: yellow
sdk: docker
app_port: 7860
pinned: false
license: apache-2.0
tags:
- openenv
- osint
- benchmark
- docker
- fastapi
short_description: Docker OSINT benchmark with fixed OpenEnv tasks.
---

# OSINT OpenEnv

OSINT OpenEnv is a synthetic benchmark environment for tool-using agents that must recover identities, trace events, and link entities across noisy multi-platform records. The project is designed to feel like a compact OSINT workflow rather than a raw QA dataset: agents query mock profiles, posts, forum threads, and semantic memory, build a working graph, and then submit an answer.

The motivation is to provide a reproducible OpenEnv-compatible environment for evaluating graph-building and tool-using reasoning without depending on live web data, unstable APIs, or private corpora. That makes it useful for local development, regression testing, and hosted demos such as a Docker-based Hugging Face Space.

## Environment Summary

The environment generates or loads a hidden canonical graph of users, aliases, organizations, locations, posts, threads, and events. It then exposes partial platform views and a task list drawn from that graph.

The default hosted Space uses the fixed-level benchmark in `datasets/fixed_levels/seed_fixed_levels.json`, which now contains 30 stable tasks over a larger shared seeded graph.

The repository now supports two dataset backends:

- `canonical` (existing fixed-level synthetic graph pipeline)
- `metaqa` (MetaQA KB + QA files for `1-hop`, `2-hop`, and `3-hop`)

Use `config/shared_config.json` or CLI flags (`--dataset-mode`, `--metaqa-root`, `--metaqa-hops`, `--metaqa-splits`) to choose which backend to run.

## Action Space

The environment exposes three actions:

- `CALL_TOOL`: query platform views or semantic memory such as `search_posts`, `get_profile`, `search_threads`, `get_connections`, or `search_memory`.
- `ADD_EDGE`: add a candidate relation to the working memory graph.
- `ANSWER`: submit the final answer as an exact node id string.

## Observation Space

Each step returns a JSON observation with four parts:

- `tool_outputs`: the most recent tool results.
- `graph_snapshot`: the current working-memory graph edges.
- `action_history`: recent actions and rewards.
- `task`: the active task id, task type, and question.
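As a concrete illustration of this contract, the sketch below resets an episode and submits actions over HTTP, then reads the four observation fields. It is a minimal sketch against the `/reset` and `/step` routes; the exact request field names (`task_index`, `session_id`, `action_type`, `payload`) are assumptions for illustration, not the Space's verified schema.

```python
# Minimal, illustrative client for the OSINT OpenEnv HTTP API.
# Assumed payload shape: /reset returns a session id plus the first observation,
# and /step takes {"session_id", "action"} with an action_type of
# CALL_TOOL / ADD_EDGE / ANSWER. Field names are a sketch, not a confirmed schema.
import requests

SPACE_URL = "http://127.0.0.1:7860"  # or the deployed Space URL

# Start an episode on the first fixed task.
reset = requests.post(f"{SPACE_URL}/reset", json={"task_index": 0}, timeout=60)
reset.raise_for_status()
episode = reset.json()
session_id = episode.get("session_id")

# Query a tool, then submit a final answer as an exact node id string.
for action in (
    {"action_type": "CALL_TOOL", "payload": {"tool": "search_posts", "query": "alias"}},
    {"action_type": "ANSWER", "payload": {"answer": "user_0001"}},
):
    step = requests.post(
        f"{SPACE_URL}/step",
        json={"session_id": session_id, "action": action},
        timeout=60,
    )
    step.raise_for_status()
    obs = step.json()
    # The four observation parts described above.
    print({k: obs.get(k) for k in ("tool_outputs", "graph_snapshot", "action_history", "task")})
```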

## Task Types And Difficulty

The benchmark mixes direct lookups with multi-hop traces:

- Easy: single-hop identity resolution, organization lookup, event lookup, or location lookup.
- Mid: two-hop alias-to-user-to-organization or thread-to-event-to-user traces.
- High: cross-platform multi-hop traces combining aliases, authored content, event references, organization links, and direct connections.

In MetaQA mode, hop buckets are mapped into the same reward difficulty tiers:

- `1-hop` -> `easy`
- `2-hop` -> `medium`
- `3-hop` -> `hard`
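Expressed as code, the mapping is just a small lookup; the helper below is an illustrative sketch rather than a function exported by the package.

```python
# Illustrative hop-bucket -> difficulty-tier mapping from the list above
# (a sketch, not library code).
HOP_TO_DIFFICULTY = {
    "1-hop": "easy",
    "2-hop": "medium",
    "3-hop": "hard",
}

def metaqa_difficulty(hop_bucket: str) -> str:
    """Return the reward difficulty tier for a MetaQA hop bucket."""
    return HOP_TO_DIFFICULTY[hop_bucket]
```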

Common task families include:

- `identity_resolution`
- `network_discovery`
- `event_tracing`
- `cross_platform_linking`
- `deanonymization`
- `convoluted_trace`

Expected difficulty increases with the number of relations the agent must chain together and whether the evidence is split across posts, threads, aliases, and profile edges.

## Repository Layout

```text
src/osint_env/
  agents/      single-agent and swarm runners
  baselines/   reusable OpenAI baseline runner
  config/      shared config and seed loading
  data/        graph/view/task generation
  domain/      dataclasses and environment models
  env/         environment, reward logic, OpenEnv compatibility shim
  eval/        evaluation metrics and leaderboard helpers
  llm/         mock, Ollama, and OpenAI client wrappers
  memory/      working graph and semantic memory
  platforms/   tool APIs over synthetic platform views
  viz/         dashboard export

scripts/
  build_fixed_levels_dataset.py
  run_openai_baseline.py

datasets/fixed_levels/
  seed_fixed_levels.json
  shared_config_fixed_levels.json
  qwen_swarm_benchmark_fixed_levels.json

server.py      FastAPI app for local use and Docker/HF Spaces
Dockerfile     Container entrypoint for Hugging Face Docker Spaces
```

## Setup

Python 3.10+ is required.

Local install:

```bash
python -m pip install -e .
```

Install optional adversarial self-play training stack (TRL + Transformers):

```bash
python -m pip install -e ".[train]"
```

Run tests:

```bash
python -m pytest -q
```

Run the automated release gate:

```bash
python scripts/validate_release.py
```

## Usage

Run one demo episode:

```bash
osint-env demo --agent-mode swarm --llm-provider mock
```

Run against MetaQA using the provided sample config:

```bash
osint-env demo --config config/shared_config_metaqa.json --dataset-mode metaqa --llm-provider mock
```

Run MetaQA with only selected hop buckets:

```bash
osint-env eval --config config/shared_config_metaqa.json --dataset-mode metaqa --metaqa-hops 1-hop,2-hop --episodes 5 --llm-provider mock
```

Run a quick evaluation:

```bash
osint-env eval --episodes 5 --agent-mode swarm --llm-provider mock
```

Export a dashboard:

```bash
osint-env benchmark --episodes 5 --agent-mode swarm --llm-provider mock --name quick_check
```

Run Kimi-style adversarial self-play scaffold (dry-run by default in the example config):

```bash
osint-env train-self-play --config config/shared_config.json --train-config config/self_play_training_example.json --dry-run
```

When you have compute and the train dependencies installed, remove `--dry-run` (or set `"dry_run": false` in the training config) to execute TRL GRPO updates for alternating generator and answerer phases.

The training config also supports `"model_topology": "dual"|"shared"`, `"phase_schedule": "generator_answerer"|"answerer_generator_answerer"`, `"tuning_mode": "full"|"lora"`, and `"canonical_graph_mode": "generate"|"fixed"` so you can switch between two-model vs single-model self-play, full fine-tuning vs LoRA adapters, and whether canonical graph structure is generated each round or kept fixed while training question/answer behavior.

### Hugging Face Space Smoke Run (Qwen 3.5 0.8B + W&B)

For a short verification run (enough to confirm W&B logging before scaling up), use:

```bash
osint-env train-self-play --config config/shared_config.json --train-config config/self_play_training_hf_a10g_smoke.json
```

This config:

- uses `Qwen/Qwen3.5-0.8B`
- enables W&B reporting (`wandb_enabled: true`)
- uses `pipeline_mode: "swarm_v2"` with `canonical_graph_mode: "fixed"` to keep canonical graph candidates stable while training question/answer behavior
- keeps training intentionally short (`rounds=1`, `max_steps=5` per phase)
- uses LoRA with small batch settings so it can run as a smoke test on an A10G

To enable canonical graph generation during swarm_v2 training, switch `"canonical_graph_mode"` to `"generate"` in the training config.

Space setup checklist:

1. In Space **Settings -> Hardware**, select **NVIDIA A10G (large)**.
2. In Space **Settings -> Variables and secrets**, set `WANDB_API_KEY`.
3. Set `HF_TOKEN` in Space secrets to avoid unauthenticated Hub downloads and stricter rate limits.
4. Optionally set `WANDB_ENTITY` if your project belongs to a team.
5. Set `RUN_SELF_PLAY_TRAINING=1` in Space variables to trigger training during container startup.
6. Optional overrides:
   - `TRAIN_SELF_PLAY_CONFIG_PATH` (default: `config/self_play_training_hf_a10g_smoke.json`)
   - `TRAIN_ENV_CONFIG_PATH` (default: `config/shared_config.json`)
   - `RUN_SELF_PLAY_DRY_RUN=1` to test startup wiring without GRPO updates.
   - `OSINT_TRAIN_STRICT_ASSERTS=1` to fail fast when reward variance, KL, loss, grad norms, or parameter updates stay zero.
7. Restart the Space and monitor build/runtime logs for the training run.
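The checklist items above describe the environment-variable contract that the container startup honors. The snippet below is a hypothetical Python rendering of that gating logic, included only to make the contract explicit; the actual behavior lives in `scripts/space_start.sh`.

```python
# Hypothetical startup gating (illustration of the env-var contract above;
# not the real scripts/space_start.sh).
import os
import subprocess

if os.environ.get("RUN_SELF_PLAY_TRAINING") == "1":
    cmd = [
        "osint-env", "train-self-play",
        "--config", os.environ.get("TRAIN_ENV_CONFIG_PATH", "config/shared_config.json"),
        "--train-config", os.environ.get(
            "TRAIN_SELF_PLAY_CONFIG_PATH", "config/self_play_training_hf_a10g_smoke.json"
        ),
    ]
    if os.environ.get("RUN_SELF_PLAY_DRY_RUN") == "1":
        cmd.append("--dry-run")  # wiring check only, no GRPO updates
    subprocess.run(cmd, check=True)
```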

W&B run naming is controlled by `wandb_run_name_prefix` and will emit phase-specific runs like `...-r001-generator` and `...-r001-answerer`.

### Reward Functions In Self-Play (Generator + Answerer)

Self-play trains two policies with role-specific reward functions defined in `src/osint_env/training/rewards.py`.

Generator reward (`GeneratorRewardFunction`) and answerer reward (`AnswererRewardFunction`) are both returned to GRPO as scalar scores per completion, and both are clipped to a stable range before optimization.

#### Generator Reward (Task-Proposing Agent)

The generator is rewarded for producing valid, grounded, diverse, and hard tasks.

In `legacy` pipeline mode, the reward is a weighted sum:

- `validity`: checks non-empty `question`, non-empty `answer`, and bounded `supporting_edges`.
- `hardness`: uses a frozen answerer judge; reward is higher when the judge gets the generated question wrong.
- `diversity`: penalizes near-duplicate questions via token-level Jaccard similarity against prior generated questions.
- `consistency`: checks that support edges exist in the canonical graph and that the answer/question are graph-grounded.

Default weights (configurable through `generator_reward_weights` in training config):

- `validity`: `0.35`
- `hardness`: `0.45`
- `diversity`: `0.10`
- `consistency`: `0.10`
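A rough sketch of how these terms combine is shown below, assuming each component has already been scored in `[0, 1]`. It only mirrors the weighted-sum-and-clip shape described above; the real `GeneratorRewardFunction` in `src/osint_env/training/rewards.py` is more involved.

```python
# Sketch of the legacy-mode generator reward: a clipped weighted sum of
# validity, hardness, diversity, and consistency. Component scoring here is
# simplified; only the combination mirrors the description above.
DEFAULT_WEIGHTS = {"validity": 0.35, "hardness": 0.45, "diversity": 0.10, "consistency": 0.10}

def jaccard(a: str, b: str) -> float:
    """Token-level Jaccard similarity used for the diversity penalty."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / len(ta | tb) if ta | tb else 0.0

def generator_reward(components: dict, weights: dict = DEFAULT_WEIGHTS) -> float:
    """Weighted sum of per-component scores, clipped to a stable range."""
    score = sum(weights[name] * components.get(name, 0.0) for name in weights)
    return max(-1.0, min(1.0, score))

# Example: a valid, graph-grounded question that the frozen judge answered
# correctly (hardness 0) and that is fairly novel versus a prior question.
prior = "Which canonical user owns alias alias_seed_001?"
new_q = "Which organization employs the user behind alias alias_seed_002?"
components = {
    "validity": 1.0,
    "hardness": 0.0,
    "diversity": 1.0 - jaccard(new_q, prior),
    "consistency": 1.0,
}
print(round(generator_reward(components), 3))
```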

In `swarm_v2` pipeline mode, generation uses strict replay/validation first, then a structured reward:

- Hard-gated validation via `SwarmV2ReplayValidator` (invalid samples get a fixed negative reward path).
- Reward components include validity, derivability/replayability, hardness, swarm diversity, shared-context pressure targeting, and PARL-inspired orchestration bonuses (`parallel` + `finish`).
- Invalid or non-replayable candidates are penalized before the weighted positive terms are applied.

#### Answerer Reward (Question-Solving Agent)

The answerer reward wraps environment-native grading so train-time behavior matches benchmark-time incentives.

For each completion, `AnswererRewardFunction`:

- extracts the predicted answer from completion text/JSON,
- reconstructs a transient `TaskInstance` from row fields (`question`, `answer`, `supporting_edges_json`, `difficulty`),
- optionally extracts predicted supporting edges from JSON or text,
- calls `compute_answer_reward(...)` from `src/osint_env/env/reward.py`.

`compute_answer_reward` combines exact answer quality with graph-utility shaping:

- output format validity and exact correctness,
- knowledge-carrier and knowledge-indexing utility,
- connectivity and supporting-edge F1 against task support edges,
- efficiency and compactness penalties,
- relation/entity informativeness and repetition control (difficulty-dependent).
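The toy grader below illustrates just two of those components, exact answer match and supporting-edge F1, assuming a JSON completion with `answer` and `supporting_edges` keys. It is not the project's `compute_answer_reward`.

```python
# Simplified answerer grading: exact-match answer term plus supporting-edge F1.
# Illustrates only two of the components listed above.
import json

def edge_key(edge: dict) -> tuple:
    return (edge.get("src"), edge.get("rel"), edge.get("dst"))

def grade_completion(completion: str, gold_answer: str, gold_edges: list) -> float:
    try:
        payload = json.loads(completion)
    except json.JSONDecodeError:
        return -0.1  # format penalty for non-JSON output
    answer_term = 1.0 if str(payload.get("answer", "")).strip() == gold_answer else 0.0

    pred = {edge_key(e) for e in payload.get("supporting_edges", [])}
    gold = {edge_key(e) for e in gold_edges}
    if pred and gold:
        precision = len(pred & gold) / len(pred)
        recall = len(pred & gold) / len(gold)
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    else:
        f1 = 0.0
    return answer_term + 0.5 * f1

gold_edges = [{"src": "alias_seed_001", "rel": "alias_of", "dst": "user_seed_001"}]
completion = '{"answer": "user_seed_001", "supporting_edges": [{"src": "alias_seed_001", "rel": "alias_of", "dst": "user_seed_001"}]}'
print(grade_completion(completion, "user_seed_001", gold_edges))  # 1.5
```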

Difficulty controls (`easy`, `medium`, `hard`) are preserved during training exactly as in the environment scorer, so the answerer sees the same tiered reward profile used in evaluation.

In `swarm_v2`, the answerer reward also adds PARL-style orchestration credit (spawn/finish behavior) on top of base answer reward when orchestrator telemetry is present in the completion payload.

Detailed design notes are in `docs/adversarial_self_play.md`.

## OpenAI Baseline

The reproducible OpenAI baseline is implemented in `scripts/run_openai_baseline.py`. It runs on the fixed-level benchmark, uses a stable seeded graph/task set, writes a JSON artifact, appends a leaderboard record, and exports a dashboard.

Default behavior:

- dataset: fixed-level benchmark
- episodes: 30
- max steps per episode: 8
- temperature: 0.0
- output artifact: `artifacts/baselines/openai_fixed_levels_latest.json`

Run it with an API key:

```bash
export OPENAI_API_KEY="your_key_here"
python scripts/run_openai_baseline.py --model gpt-5-nano
```

The script is designed to stay bounded enough for a normal benchmark pass to finish comfortably under 20 minutes on a lightweight chat model, while still using the full fixed task set. For repeatability it fixes the benchmark graph/tasks and uses deterministic decoding settings. Because remote model backends can still change over time, the output artifact also records model metadata and system fingerprints when available.

## Inference Script

The submission-ready inference entrypoint is the root `inference.py` file. It talks to the deployed Hugging Face Space over HTTP, uses the OpenAI client for all model calls, and emits structured stdout logs in the `[START]`, `[STEP]`, and `[END]` format.

The script accepts `HF_TOKEN` as the primary auth variable and also supports `OPENAI_API_KEY` or `API_KEY` as local fallbacks.
After a successful run, `inference.py` also posts the evaluation summary back to the Space so the latest `/dashboard` view reflects that run.

Required environment variables:

- `API_BASE_URL`
- `MODEL_NAME`
- `HF_TOKEN`

Optional environment variables:

- `SPACE_URL` default: `https://siddeshwar1625-osint.hf.space`
- `TASK_INDICES` default: `0,10,20`
- `MAX_STEPS` default: `8`
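The variable handling described above follows a common pattern; the snippet below is a sketch of that pattern with the documented defaults, not a copy of `inference.py`.

```python
# Sketch of inference.py-style configuration read from the environment.
# Defaults mirror the lists above; auth falls back across HF_TOKEN,
# OPENAI_API_KEY, and API_KEY.
import os

API_BASE_URL = os.environ["API_BASE_URL"]   # required
MODEL_NAME = os.environ["MODEL_NAME"]       # required
API_KEY = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("OPENAI_API_KEY")
    or os.environ.get("API_KEY")
)
if not API_KEY:
    raise SystemExit("Set HF_TOKEN (or OPENAI_API_KEY / API_KEY) before running.")

SPACE_URL = os.environ.get("SPACE_URL", "https://siddeshwar1625-osint.hf.space")
TASK_INDICES = [int(i) for i in os.environ.get("TASK_INDICES", "0,10,20").split(",")]
MAX_STEPS = int(os.environ.get("MAX_STEPS", "8"))
```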

Example local test command against a running local server:

```bash
API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-5.4-mini OPENAI_API_KEY=your_key SPACE_URL=http://127.0.0.1:7860 python inference.py
```

Example test command against the deployed Space:

```bash
API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-5.4-mini OPENAI_API_KEY=your_key SPACE_URL=https://siddeshwar1625-osint.hf.space python inference.py
```

## Docker And Hugging Face Space

The repository is ready for a Docker-based Hugging Face Space:

- `README.md` includes `sdk: docker`
- `README.md` includes the `openenv` Space tag
- `Dockerfile` serves `server.py` on port `7860`

Local Docker smoke test:

```bash
docker build -t osint-openenv .
docker run --rm -p 7860:7860 osint-openenv
```

Then open `http://localhost:7860`.

The FastAPI app serves:

- `/`: overview page
- `/dashboard`: generated benchmark dashboard
- `/api/environment`: environment metadata
- `/health`: health check (validator-friendly alias)
- `/healthz`: health check (legacy alias)
- `/openenv.yaml`: OpenEnv HTTP spec stub
- `/openenv/tasks`: task enumeration
- `/reset` and `/openenv/reset`: episode reset endpoints
- `/step` and `/openenv/step`: episode step endpoints
- `/state` and `/openenv/state/{session_id}`: session state endpoints (`/state` returns the latest session)
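A quick liveness probe against a few of those read-only routes can be written in a handful of lines; the snippet assumes nothing beyond the endpoint list above.

```python
# Minimal probe of the Space's read-only endpoints listed above.
import requests

BASE = "http://localhost:7860"  # or the deployed Space URL

for route in ("/health", "/api/environment", "/openenv/tasks"):
    resp = requests.get(f"{BASE}{route}", timeout=30)
    print(route, resp.status_code)
    resp.raise_for_status()
```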

## Automated Validation

The repository includes a pass/fail validation gate for the core delivery requirements:

- Hugging Face Space readiness
- OpenEnv spec compliance
- reproducible baseline behavior
- at least 3 fixed tasks with working graders
- Docker image build in CI

Local gate:

```bash
python scripts/validate_release.py
```

CI gate:

- `.github/workflows/validation.yml`
- runs `pytest`
- runs the validation script
- runs `docker build`

## Baseline Scores

The fixed-level benchmark was expanded from the earlier 15-question set to a 30-question set with a larger seeded graph, so older benchmark artifacts should be treated as legacy and regenerated on the new dataset before using them as reference scores.

After you supply an OpenAI API key, the current baseline scores for the expanded benchmark will be written to:

- `artifacts/baselines/openai_fixed_levels_latest.json`
- `artifacts/baselines/openai_fixed_levels_dashboard.html`

## Notes On `pyproject.toml`

The packaging file is structurally correct for a `src/` layout and editable installs. The main gaps were deployment/runtime related rather than build-breaking:

- `openenv` is now version-bounded explicitly.
- `fastapi` and `uvicorn` are included because the repo now ships a real web server.
- pytest is pointed at the `tests/` directory, and the test suite also adds `src/` to `sys.path` so source-layout imports work reliably during local runs.

## Development Notes

The project keeps a lightweight local compatibility shim for `openenv` so the source tree remains importable even before dependencies are installed. In a normal install or Docker build, the real `openenv` package from PyPI is still used.
artifacts/leaderboard.json
ADDED
@@ -0,0 +1,453 @@
[
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 1,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T12:03:13+00:00",
    "episodes": 2,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.1,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": 0.024705877237863647,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.15,
      "avg_relation_informativeness_reward": 0.03137141693971891,
      "avg_reward": 3.534162700533434,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8618382743087459,
      "retrieval_signal": 0.7275,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.6082154588355165,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.25
    },
    "run_id": "run_0001",
    "run_name": "swarm_seed_smoke"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 1,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T12:16:28+00:00",
    "episodes": 2,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.1,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": 0.024705877237863647,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.15,
      "avg_relation_informativeness_reward": 0.03137141693971891,
      "avg_reward": 3.534162700533434,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8618382743087459,
      "retrieval_signal": 0.7275,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.6082154588355165,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.25
    },
    "run_id": "run_0002",
    "run_name": "swarm_seed_smoke"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 0,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T12:25:15+00:00",
    "episodes": 20,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.10000000000000002,
      "avg_connectivity_reward": 0.23999999999999994,
      "avg_diversity_reward": 0.08000000000000002,
      "avg_entity_informativeness_reward": -0.00983642442912193,
      "avg_format_reward": 0.14999999999999997,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.1125,
      "avg_relation_informativeness_reward": 0.007185245326892638,
      "avg_reward": 3.351267560586956,
      "avg_soft_shaping_reward": 0.14999999999999997,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8573187614039594,
      "retrieval_signal": 0.7143750000000001,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5814697641795541,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.25
    },
    "run_id": "run_0003",
    "run_name": "baseline_swarm"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 1,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T17:27:30+00:00",
    "episodes": 1,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.1,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": 0.06128386989162576,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.3,
      "avg_relation_informativeness_reward": 0.12,
      "avg_reward": 3.916035942914144,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8718832338515622,
      "retrieval_signal": 0.78,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.6332567739783251,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.25
    },
    "run_id": "run_0004",
    "run_name": "ollama_qwen_smoke"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 1,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T17:29:12+00:00",
    "episodes": 1,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.1,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": 0.06128386989162576,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.3,
      "avg_relation_informativeness_reward": 0.12,
      "avg_reward": 4.059369276247478,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.9020114237119466,
      "retrieval_signal": 0.78,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.6332567739783251,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0005",
    "run_name": "ollama_qwen_smoke2"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 0,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T17:39:15+00:00",
    "episodes": 2,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.2,
      "avg_connectivity_reward": 0.0,
      "avg_diversity_reward": 0.0683333333333333,
      "avg_entity_informativeness_reward": -0.07397348480982455,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 0.6666666666666667,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.14884615384615385,
      "avg_relation_informativeness_reward": -0.00860389783205907,
      "avg_reward": 4.351764433970379,
      "avg_soft_shaping_reward": 0.3,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 0.0,
      "leaderboard_score": 0.6973935600514568,
      "retrieval_signal": 0.7270961538461539,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5137345234716233,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0006",
    "run_name": "high_timeout_shared_ctx"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 0,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T18:57:40+00:00",
    "episodes": 3,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.13333333333333333,
      "avg_connectivity_reward": 0.09999999999999999,
      "avg_diversity_reward": 0.056666666666666664,
      "avg_entity_informativeness_reward": -0.020478979694240708,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 0.8148148148148149,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.27,
      "avg_relation_informativeness_reward": 0.07174291752145656,
      "avg_reward": 4.0269419367756605,
      "avg_soft_shaping_reward": 0.19999999999999998,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 0.0,
      "leaderboard_score": 0.7366215569569294,
      "retrieval_signal": 0.7695000000000001,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5570861208987765,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0007",
    "run_name": "episode_selector_check"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 15,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T19:11:44+00:00",
    "episodes": 3,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.10000000000000002,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": -0.02722031691758704,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.0,
      "avg_relation_informativeness_reward": -0.00011920119799207429,
      "avg_reward": 3.444079221573606,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8828572592896698,
      "retrieval_signal": 0.675,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5915320963768841,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0008",
    "run_name": "qwen_rerun"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 15,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T19:19:34+00:00",
    "episodes": 3,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.10000000000000002,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": -0.024861029515896544,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.0,
      "avg_relation_informativeness_reward": -0.0024320085090966614,
      "avg_reward": 3.4441257016641917,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8828581656226586,
      "retrieval_signal": 0.675,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5915413923950014,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0009",
    "run_name": "qwen_episode_fix"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 15,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T19:24:37+00:00",
    "episodes": 3,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.10000000000000002,
      "avg_connectivity_reward": 0.3,
      "avg_diversity_reward": 0.08,
      "avg_entity_informativeness_reward": -0.02722031691758704,
      "avg_format_reward": 0.15,
      "avg_graph_f1": 1.0,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.0,
      "avg_relation_informativeness_reward": -0.0030604289114462002,
      "avg_reward": 3.4411379938601514,
      "avg_soft_shaping_reward": 0.15,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8827999009847504,
      "retrieval_signal": 0.675,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5909438508341933,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0010",
    "run_name": "qwen_rerun_graph_fix"
  },
  {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 18,
      "max_width": 2,
      "seed": 7,
      "seeded_questions": 15,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T19:31:54+00:00",
    "episodes": 15,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.16666666666666666,
      "avg_connectivity_reward": 0.16999999999999998,
      "avg_diversity_reward": 0.1157777777777778,
      "avg_entity_informativeness_reward": -0.0181244777358718,
      "avg_format_reward": 0.14999999999999997,
      "avg_graph_f1": 0.8492063492063492,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.012000000000000002,
      "avg_relation_informativeness_reward": 0.05935837081627929,
      "avg_reward": 4.201760569277529,
      "avg_soft_shaping_reward": 0.24999999999999994,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8534887252258901,
      "retrieval_signal": 0.6792,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5847801119494148,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0011",
    "run_name": "qwen_rerun_graph_fix"
  }
]
config/seed_example.json
ADDED
@@ -0,0 +1,52 @@
{
  "seeding": {
    "seeded_nodes": [
      {
        "node_id": "alias_seed_001",
        "node_type": "alias",
        "attrs": {
          "handle": "@shadow_seed"
        }
      },
      {
        "node_id": "user_seed_001",
        "node_type": "user",
        "attrs": {
          "name": "Seed User",
          "org": "Helios Labs",
          "location": "Pune"
        }
      }
    ],
    "_note": "Use with --seed-file. LLM provider and API keys are configured in config/shared_config.json or CLI flags.",
    "seeded_edges": [
      {
        "src": "alias_seed_001",
        "rel": "alias_of",
        "dst": "user_seed_001",
        "confidence": 1.0
      }
    ],
    "seeded_questions": [
      {
        "task_type": "identity_resolution",
        "question": "Which canonical user owns alias alias_seed_001?",
        "answer": "user_seed_001",
        "supporting_edges": [
          {
            "src": "alias_seed_001",
            "rel": "alias_of",
            "dst": "user_seed_001"
          }
        ],
        "metadata": {
          "source": "manual_seed"
        }
      }
    ],
    "llm_generate_remaining_graph": true,
    "llm_generate_remaining_tasks": true,
    "llm_generated_edge_budget": 6,
    "llm_generated_task_budget": 8
  }
}
config/seed_ollama_smoke.json
ADDED
@@ -0,0 +1,51 @@
{
  "seeding": {
    "seeded_nodes": [
      {
        "node_id": "alias_smoke_001",
        "node_type": "alias",
        "attrs": {
          "handle": "@smoke_alias"
        }
      },
      {
        "node_id": "user_smoke_001",
        "node_type": "user",
        "attrs": {
          "name": "Smoke User",
          "org": "Apex Dynamics",
          "location": "Bengaluru"
        }
      }
    ],
    "seeded_edges": [
      {
        "src": "alias_smoke_001",
        "rel": "alias_of",
        "dst": "user_smoke_001",
        "confidence": 1.0
      }
    ],
    "seeded_questions": [
      {
        "task_type": "identity_resolution",
        "question": "Which canonical user owns alias alias_smoke_001?",
        "answer": "user_smoke_001",
        "supporting_edges": [
          {
            "src": "alias_smoke_001",
            "rel": "alias_of",
            "dst": "user_smoke_001"
          }
        ],
        "metadata": {
          "source": "ollama_smoke"
        }
      }
    ],
    "llm_generate_remaining_graph": false,
    "llm_generate_remaining_tasks": false,
    "llm_generated_edge_budget": 0,
    "llm_generated_task_budget": 0
  }
}
config/self_play_training_example.json
ADDED
@@ -0,0 +1,105 @@
{
  "rounds": 3,
  "output_dir": "artifacts/self_play",
  "dry_run": true,
  "canonical_graph_mode": "generate",
  "pipeline_mode": "swarm_v2",
  "model_topology": "dual",
  "phase_schedule": "generator_answerer",
  "tuning_mode": "full",
  "shared_model_name_or_path": "",
  "seed_tasks_per_round": 16,
  "generated_tasks_per_round": 24,
  "generator_prompts_per_round": 24,
  "max_graph_context_nodes": 100,
  "max_graph_context_edges": 100,
  "max_support_edges": 8,
  "answerer_judge_max_new_tokens": 48,
  "generator_reward_weights": {
    "validity": 0.35,
    "hardness": 0.45,
    "diversity": 0.1,
    "consistency": 0.1
  },
  "lora": {
    "r": 16,
    "alpha": 32,
    "dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "swarm_v2": {
    "generator_swarm": {
      "shared_context": true,
      "max_agents": 4,
      "max_breadth": 3,
      "max_depth": 2,
      "planner_rounds": 2,
      "tools_per_agent": 2
    },
    "answerer_swarm": {
      "shared_context": true,
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "planner_rounds": 2,
      "tools_per_agent": 2
    },
    "validation": {
      "max_support_edges": 8,
      "max_path_hops": 4,
      "max_context_nodes": 14,
      "max_context_edges": 8,
      "duplicate_similarity_threshold": 0.8
    },
    "shared_context": {
      "shared_by_default": true,
      "max_nodes": 14,
      "max_edges": 8,
      "target_pressure": 0.85
    }
  },
  "generator_phase": {
    "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
    "learning_rate": 1e-06,
    "max_steps": 64,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_generations": 4,
    "max_completion_length": 256,
    "temperature": 1.0,
    "top_p": 1.0,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "none",
    "logging_steps": 10,
    "save_steps": 50,
    "output_subdir": "generator",
    "use_vllm": false,
    "vllm_mode": "colocate"
  },
  "answerer_phase": {
    "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
    "learning_rate": 1e-06,
    "max_steps": 64,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "num_generations": 4,
    "max_completion_length": 192,
    "temperature": 1.0,
    "top_p": 1.0,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "none",
    "logging_steps": 10,
    "save_steps": 50,
    "output_subdir": "answerer",
    "use_vllm": false,
    "vllm_mode": "colocate"
  }
}
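The `generator_reward_weights` block above assigns weights to four graded signals (validity, hardness, diversity, consistency) that sum to 1.0. A minimal sketch of how such weights could be folded into a single scalar reward is shown below; the per-component scores and the clamping to [0, 1] are assumptions for illustration, not the project's exact reward-shaping code.

```python
GENERATOR_REWARD_WEIGHTS = {
    "validity": 0.35,
    "hardness": 0.45,
    "diversity": 0.1,
    "consistency": 0.1,
}


def generator_reward(components: dict[str, float],
                     weights: dict[str, float] = GENERATOR_REWARD_WEIGHTS) -> float:
    """Weighted sum of per-component scores, each assumed to lie in [0, 1]."""
    total = 0.0
    for name, weight in weights.items():
        score = max(0.0, min(1.0, components.get(name, 0.0)))  # clamp: assumption
        total += weight * score
    return total


# Example: a valid but easy, moderately diverse generated question.
print(generator_reward({"validity": 1.0, "hardness": 0.2, "diversity": 0.6, "consistency": 1.0}))
```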
config/self_play_training_hf_a10g_smoke.json
ADDED
@@ -0,0 +1,72 @@
{
  "rounds": 2,
  "output_dir": "artifacts/self_play_hf_a10g_train",
  "dry_run": false,
  "wandb_enabled": true,
  "wandb_project": "osint-self-play-train",
  "wandb_entity": "",
  "wandb_run_name_prefix": "qwen35-08b-a10g-train",
  "pipeline_mode": "swarm_v2",
  "canonical_graph_mode": "fixed",
  "model_topology": "shared",
  "phase_schedule": "generator_answerer",
  "tuning_mode": "lora",
  "shared_model_name_or_path": "Qwen/Qwen3.5-0.8B",
  "seed_tasks_per_round": 16,
  "generated_tasks_per_round": 24,
  "generator_prompts_per_round": 24,
  "max_graph_context_nodes": 40,
  "max_graph_context_edges": 40,
  "max_support_edges": 6,
  "answerer_judge_max_new_tokens": 32,
  "generator_phase": {
    "model_name_or_path": "Qwen/Qwen3.5-0.8B",
    "learning_rate": 1e-06,
    "max_steps": 50,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "num_generations": 2,
    "max_completion_length": 384,
    "temperature": 0.7,
    "top_p": 0.9,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "group",
    "logging_steps": 1,
    "save_steps": 10,
    "output_subdir": "generator_train",
    "use_vllm": false,
    "vllm_mode": "colocate"
  },
  "answerer_phase": {
    "model_name_or_path": "Qwen/Qwen3.5-0.8B",
    "learning_rate": 1e-06,
    "max_steps": 50,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "num_generations": 2,
    "max_completion_length": 192,
    "temperature": 0.4,
    "top_p": 0.9,
    "beta": 0.01,
    "epsilon": 0.2,
    "num_iterations": 1,
    "loss_type": "dapo",
    "scale_rewards": "group",
    "logging_steps": 1,
    "save_steps": 10,
    "output_subdir": "answerer_train",
    "use_vllm": false,
    "vllm_mode": "colocate"
  },
  "lora": {
    "r": 8,
    "alpha": 16,
    "dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "bias": "none",
    "task_type": "CAUSAL_LM"
  }
}
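With `tuning_mode` set to `lora`, the `lora` block above maps almost one-to-one onto a PEFT `LoraConfig`; note the JSON uses `alpha`/`dropout` where PEFT expects `lora_alpha`/`lora_dropout`. The translation could look like the sketch below, assuming the `peft` library is installed (the helper function itself is hypothetical, not part of this repo).

```python
from peft import LoraConfig


def lora_config_from_dict(cfg: dict) -> LoraConfig:
    """Translate the JSON "lora" block into a PEFT LoraConfig (illustrative helper)."""
    return LoraConfig(
        r=cfg["r"],
        lora_alpha=cfg["alpha"],      # JSON key "alpha"
        lora_dropout=cfg["dropout"],  # JSON key "dropout"
        target_modules=cfg["target_modules"],
        bias=cfg["bias"],
        task_type=cfg["task_type"],
    )


lora_cfg = lora_config_from_dict({
    "r": 8,
    "alpha": 16,
    "dropout": 0.05,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
    "bias": "none",
    "task_type": "CAUSAL_LM",
})
print(lora_cfg)
```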
config/shared_config.json
ADDED
@@ -0,0 +1,63 @@
{
  "environment": {
    "n_users": 40,
    "alias_density": 0.35,
    "noise_level": 0.15,
    "red_herring_rate": 0.1,
    "max_steps": 18,
    "seed": 7
  },
  "dataset": {
    "mode": "canonical",
    "metaqa_root": "metaQA",
    "metaqa_kb_path": "",
    "metaqa_variant": "vanilla",
    "metaqa_hops": ["1-hop", "2-hop", "3-hop"],
    "metaqa_splits": ["train", "dev", "test"]
  },
  "swarm": {
    "enabled": true,
    "max_agents": 3,
    "max_breadth": 2,
    "max_width": 2,
    "max_depth": 2,
    "planner_rounds": 2,
    "tools_per_agent": 1
  },
  "spawn_reward": {
    "lambda_parallel": 0.15,
    "lambda_finish": 0.2,
    "anneal": 1.0,
    "max_parallel_hint": 3
  },
  "seeding": {
    "seeded_nodes": [],
    "seeded_edges": [],
    "seeded_questions": [],
    "llm_generate_remaining_graph": true,
    "llm_generate_remaining_tasks": true,
    "llm_generated_edge_budget": 6,
    "llm_generated_task_budget": 8,
    "llm_generation_parallel": true,
    "llm_generation_workers": 3,
    "llm_generation_retries": 2,
    "allow_template_fallback_on_llm_failure": false
  },
  "llm": {
    "provider": "ollama",
    "model": "qwen3:2b",
    "temperature": 0.1,
    "max_tokens": 256,
    "timeout_seconds": 240,
    "ollama_base_url": "http://127.0.0.1:11434",
    "openai_base_url": "https://api.openai.com/v1",
    "openai_api_key_env": "OPENAI_API_KEY",
    "openai_api_key": ""
  },
  "runtime": {
    "default_episodes": 20,
    "leaderboard_path": "artifacts/leaderboard.json",
    "dashboard_path": "artifacts/osint_dashboard.html",
    "sweep_dashboard_dir": "artifacts/sweep_dashboards"
  }
}
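The `seeding` block's `llm_generation_retries`, `llm_generation_workers`, `llm_generation_parallel`, and `allow_template_fallback_on_llm_failure` fields describe the retry-backed generation path mentioned in the commit message. A minimal sketch of that control flow is below; the callable names and the fallback payload are assumptions used only to illustrate how these settings interact, not the project's implementation.

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable


def generate_with_retries(generate: Callable[[str], dict], prompt: str,
                          retries: int = 2,
                          allow_template_fallback: bool = False) -> dict:
    """Call an LLM generator, retrying on failure; optionally fall back to a template."""
    last_error = None
    for _ in range(retries + 1):
        try:
            return generate(prompt)
        except Exception as err:  # real code would catch narrower error types
            last_error = err
    if allow_template_fallback:
        return {"source": "template_fallback", "prompt": prompt}
    raise RuntimeError(f"LLM generation failed after {retries + 1} attempts") from last_error


def generate_many(generate: Callable[[str], dict], prompts: list[str],
                  workers: int = 3, retries: int = 2) -> list[dict]:
    """Fan prompts out over a small worker pool, mirroring llm_generation_workers."""
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(lambda p: generate_with_retries(generate, p, retries), prompts))
```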
config/shared_config_metaqa.json
ADDED
@@ -0,0 +1,63 @@
{
  "environment": {
    "n_users": 40,
    "alias_density": 0.35,
    "noise_level": 0.15,
    "red_herring_rate": 0.1,
    "max_steps": 18,
    "seed": 7
  },
  "dataset": {
    "mode": "metaqa",
    "metaqa_root": "metaQA",
    "metaqa_kb_path": "metaQA/kb.txt",
    "metaqa_variant": "vanilla",
    "metaqa_hops": ["1-hop", "2-hop", "3-hop"],
    "metaqa_splits": ["train", "dev", "test"]
  },
  "swarm": {
    "enabled": true,
    "max_agents": 3,
    "max_breadth": 2,
    "max_width": 2,
    "max_depth": 2,
    "planner_rounds": 2,
    "tools_per_agent": 1
  },
  "spawn_reward": {
    "lambda_parallel": 0.15,
    "lambda_finish": 0.2,
    "anneal": 1.0,
    "max_parallel_hint": 3
  },
  "seeding": {
    "seeded_nodes": [],
    "seeded_edges": [],
    "seeded_questions": [],
    "llm_generate_remaining_graph": false,
    "llm_generate_remaining_tasks": false,
    "llm_generated_edge_budget": 0,
    "llm_generated_task_budget": 0,
    "llm_generation_parallel": true,
    "llm_generation_workers": 3,
    "llm_generation_retries": 2,
    "allow_template_fallback_on_llm_failure": false
  },
  "llm": {
    "provider": "mock",
    "model": "qwen3:2b",
    "temperature": 0.1,
    "max_tokens": 256,
    "timeout_seconds": 240,
    "ollama_base_url": "http://127.0.0.1:11434",
    "openai_base_url": "https://api.openai.com/v1",
    "openai_api_key_env": "OPENAI_API_KEY",
    "openai_api_key": ""
  },
  "runtime": {
    "default_episodes": 20,
    "leaderboard_path": "artifacts/leaderboard_metaqa.json",
    "dashboard_path": "artifacts/metaqa_dashboard.html",
    "sweep_dashboard_dir": "artifacts/metaqa_sweep_dashboards"
  }
}
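With `mode` set to `metaqa`, the environment reads its knowledge graph from `metaQA/kb.txt` instead of seeding or LLM generation. A small parsing sketch is below, assuming the standard MetaQA knowledge-base layout of one pipe-delimited `subject|relation|object` triple per line; if the local copy uses a different delimiter the split would need adjusting, and the loader here is illustrative rather than the repository's own.

```python
from pathlib import Path


def load_metaqa_kb(path: str = "metaQA/kb.txt") -> list[tuple[str, str, str]]:
    """Parse MetaQA KB triples, assuming 'subject|relation|object' lines."""
    triples: list[tuple[str, str, str]] = []
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split("|")
        if len(parts) != 3:
            continue  # skip malformed lines rather than failing the whole load
        subject, relation, obj = parts
        triples.append((subject, relation, obj))
    return triples


if __name__ == "__main__":
    kb = load_metaqa_kb()
    print(f"loaded {len(kb)} triples; sample: {kb[:3]}")
```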
datasets/fixed_levels/README.md
ADDED
@@ -0,0 +1,64 @@
# Fixed Levels Submission Dataset

This folder contains a fixed three-level OSINT benchmark set built on one shared base graph.

## Files

- `seed_fixed_levels.json`: master fixed seed with an expanded canonical graph and 30 fixed questions.
- `fixed_graph_questions.json`: extracted fixed dataset snapshot for submission packaging.
- `shared_config_fixed_levels.json`: run config used for generation and evaluation.
- `complete_dataset_qwen_generated.json`: full dataset after Qwen (`qwen3:2b` via Ollama) expands the graph.
- `qwen_swarm_eval_fixed_levels.json`: legacy Qwen swarm evaluation summary from the older, smaller version of the set.
- `qwen_swarm_benchmark_fixed_levels.json`: legacy benchmark output from the older, smaller version of the set.
- `leaderboard_fixed_levels.json`: leaderboard file for this dataset.
- `dashboard_fixed_levels.html`: interactive dashboard generated from the benchmark run.

## Difficulty Design

- Easy: 10 questions. These now use the older hard-style multi-hop traces as the new floor.
- Mid: 10 questions. Each question spans roughly 15-20 supporting nodes.
- High: 10 questions. Each question spans roughly 50 supporting nodes.

All 30 questions are fixed and share the same larger seeded graph.

## Regenerate Artifacts

```bash
source ~/arl/bin/activate
cd /home/ritish/test1
PYTHONPATH=src python scripts/build_fixed_levels_dataset.py \
  --seed-file datasets/fixed_levels/seed_fixed_levels.json \
  --shared-config datasets/fixed_levels/shared_config_fixed_levels.json \
  --output-dir datasets/fixed_levels
```

## Evaluate Qwen Swarm

```bash
source ~/arl/bin/activate
cd /home/ritish/test1
PYTHONPATH=src osint-env eval \
  --config datasets/fixed_levels/shared_config_fixed_levels.json \
  --seed-file datasets/fixed_levels/seed_fixed_levels.json \
  --agent-mode swarm \
  --llm-provider ollama \
  --llm-model qwen3:2b \
  --episodes 15
```

## Benchmark + Dashboard

```bash
source ~/arl/bin/activate
cd /home/ritish/test1
PYTHONPATH=src osint-env benchmark \
  --config datasets/fixed_levels/shared_config_fixed_levels.json \
  --seed-file datasets/fixed_levels/seed_fixed_levels.json \
  --agent-mode swarm \
  --llm-provider ollama \
  --llm-model qwen3:2b \
  --episodes 15 \
  --name fixed_levels_qwen_swarm \
  --leaderboard datasets/fixed_levels/leaderboard_fixed_levels.json \
  --dashboard datasets/fixed_levels/dashboard_fixed_levels.html
```
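Once the benchmark command in that README has written `leaderboard_fixed_levels.json`, the per-run scores can be compared with a few lines of Python. The snippet below assumes the leaderboard is a JSON list whose entries carry `run_id`, `episodes`, and a `metrics.leaderboard_score` field, which matches the leaderboard artifacts added in this commit; it is a convenience sketch, not part of the CLI.

```python
import json

with open("datasets/fixed_levels/leaderboard_fixed_levels.json", encoding="utf-8") as fh:
    runs = json.load(fh)

# Sort runs by their aggregate leaderboard score, best first.
for run in sorted(runs, key=lambda r: r["metrics"]["leaderboard_score"], reverse=True):
    print(f"{run['run_id']}: score={run['metrics']['leaderboard_score']:.3f} "
          f"episodes={run['episodes']}")
```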
datasets/fixed_levels/complete_dataset_qwen_generated.json
ADDED
The diff for this file is too large to render. See raw diff.

datasets/fixed_levels/fixed_graph_questions.json
ADDED
The diff for this file is too large to render. See raw diff.
datasets/fixed_levels/leaderboard_fixed_levels.json
ADDED
@@ -0,0 +1,1401 @@
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"config": {
|
| 4 |
+
"max_agents": 3,
|
| 5 |
+
"max_breadth": 2,
|
| 6 |
+
"max_depth": 2,
|
| 7 |
+
"max_steps": 20,
|
| 8 |
+
"max_width": 2,
|
| 9 |
+
"seed": 2026,
|
| 10 |
+
"seeded_questions": 15,
|
| 11 |
+
"swarm_enabled": true
|
| 12 |
+
},
|
| 13 |
+
"created_at": "2026-04-01T18:48:39+00:00",
|
| 14 |
+
"episodes": 15,
|
| 15 |
+
"metrics": {
|
| 16 |
+
"avg_compactness_reward": 0.0,
|
| 17 |
+
"avg_connectivity_gain_reward": 0.16666666666666666,
|
| 18 |
+
"avg_connectivity_reward": 0.16999999999999998,
|
| 19 |
+
"avg_diversity_reward": 0.1157777777777778,
|
| 20 |
+
"avg_entity_informativeness_reward": -0.08858065677817137,
|
| 21 |
+
"avg_format_reward": 0.14999999999999997,
|
| 22 |
+
"avg_graph_f1": 0.8492063492063492,
|
| 23 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 24 |
+
"avg_knowledge_indexing_reward": 0.052000000000000005,
|
| 25 |
+
"avg_relation_informativeness_reward": 0.07135858524047924,
|
| 26 |
+
"avg_reward": 4.197526826881651,
|
| 27 |
+
"avg_soft_shaping_reward": 0.24999999999999994,
|
| 28 |
+
"avg_spawn_count": 4.0,
|
| 29 |
+
"avg_spawn_critical_steps": 6.0,
|
| 30 |
+
"avg_steps_to_solution": 9.0,
|
| 31 |
+
"deanonymization_accuracy": 1.0,
|
| 32 |
+
"leaderboard_score": 0.8543934355282199,
|
| 33 |
+
"retrieval_signal": 0.6932,
|
| 34 |
+
"spawn_completion_rate": 1.0,
|
| 35 |
+
"spawn_signal": 0.6666666666666666,
|
| 36 |
+
"structural_signal": 0.5730889190257948,
|
| 37 |
+
"task_success_rate": 1.0,
|
| 38 |
+
"tool_efficiency": 0.5
|
| 39 |
+
},
|
| 40 |
+
"run_id": "run_0001",
|
| 41 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"config": {
|
| 45 |
+
"max_agents": 3,
|
| 46 |
+
"max_breadth": 2,
|
| 47 |
+
"max_depth": 2,
|
| 48 |
+
"max_steps": 24,
|
| 49 |
+
"max_width": 2,
|
| 50 |
+
"seed": 2026,
|
| 51 |
+
"seeded_questions": 30,
|
| 52 |
+
"swarm_enabled": true
|
| 53 |
+
},
|
| 54 |
+
"created_at": "2026-04-02T09:16:05+00:00",
|
| 55 |
+
"episodes": 30,
|
| 56 |
+
"metrics": {
|
| 57 |
+
"avg_compactness_reward": 0.0,
|
| 58 |
+
"avg_connectivity_gain_reward": 0.2000000000000001,
|
| 59 |
+
"avg_connectivity_reward": 0.12999999999999998,
|
| 60 |
+
"avg_diversity_reward": 0.12433333333333325,
|
| 61 |
+
"avg_entity_informativeness_reward": 0.000700571890338102,
|
| 62 |
+
"avg_format_reward": 0.15,
|
| 63 |
+
"avg_graph_f1": 0.2916528337385394,
|
| 64 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 65 |
+
"avg_knowledge_indexing_reward": 0.05070078042510192,
|
| 66 |
+
"avg_relation_informativeness_reward": 0.07853375358885142,
|
| 67 |
+
"avg_reward": 4.377456514967488,
|
| 68 |
+
"avg_soft_shaping_reward": 0.3,
|
| 69 |
+
"avg_spawn_count": 4.0,
|
| 70 |
+
"avg_spawn_critical_steps": 6.0,
|
| 71 |
+
"avg_steps_to_solution": 9.0,
|
| 72 |
+
"deanonymization_accuracy": 0.0,
|
| 73 |
+
"leaderboard_score": 0.6241912131110795,
|
| 74 |
+
"retrieval_signal": 0.6927452731487858,
|
| 75 |
+
"spawn_completion_rate": 1.0,
|
| 76 |
+
"spawn_signal": 0.6666666666666666,
|
| 77 |
+
"structural_signal": 0.5869968650958378,
|
| 78 |
+
"task_success_rate": 1.0,
|
| 79 |
+
"tool_efficiency": 0.5
|
| 80 |
+
},
|
| 81 |
+
"run_id": "run_0002",
|
| 82 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"config": {
|
| 86 |
+
"max_agents": 3,
|
| 87 |
+
"max_breadth": 2,
|
| 88 |
+
"max_depth": 2,
|
| 89 |
+
"max_steps": 24,
|
| 90 |
+
"max_width": 2,
|
| 91 |
+
"seed": 2026,
|
| 92 |
+
"seeded_questions": 30,
|
| 93 |
+
"swarm_enabled": true
|
| 94 |
+
},
|
| 95 |
+
"created_at": "2026-04-03T13:22:03+00:00",
|
| 96 |
+
"episodes": 3,
|
| 97 |
+
"metrics": {
|
| 98 |
+
"avg_compactness_reward": 0.0,
|
| 99 |
+
"avg_connectivity_gain_reward": 0.20000000000000004,
|
| 100 |
+
"avg_connectivity_reward": -0.06666666666666667,
|
| 101 |
+
"avg_diversity_reward": 0.13444444444444445,
|
| 102 |
+
"avg_entity_informativeness_reward": -0.01010882862863417,
|
| 103 |
+
"avg_format_reward": 0.15,
|
| 104 |
+
"avg_graph_f1": 0.5793650793650794,
|
| 105 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 106 |
+
"avg_knowledge_indexing_reward": 0.10372960372960373,
|
| 107 |
+
"avg_relation_informativeness_reward": 0.07108687894082726,
|
| 108 |
+
"avg_reward": 4.419313576918165,
|
| 109 |
+
"avg_soft_shaping_reward": 0.3,
|
| 110 |
+
"avg_spawn_count": 4.0,
|
| 111 |
+
"avg_spawn_critical_steps": 6.0,
|
| 112 |
+
"avg_steps_to_solution": 9.0,
|
| 113 |
+
"deanonymization_accuracy": 0.0,
|
| 114 |
+
"leaderboard_score": 0.6797400780463063,
|
| 115 |
+
"retrieval_signal": 0.7113053613053614,
|
| 116 |
+
"spawn_completion_rate": 1.0,
|
| 117 |
+
"spawn_signal": 0.6666666666666666,
|
| 118 |
+
"structural_signal": 0.5356956100624386,
|
| 119 |
+
"task_success_rate": 1.0,
|
| 120 |
+
"tool_efficiency": 0.5
|
| 121 |
+
},
|
| 122 |
+
"run_id": "run_0003",
|
| 123 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"config": {
|
| 127 |
+
"max_agents": 3,
|
| 128 |
+
"max_breadth": 2,
|
| 129 |
+
"max_depth": 2,
|
| 130 |
+
"max_steps": 24,
|
| 131 |
+
"max_width": 2,
|
| 132 |
+
"seed": 2026,
|
| 133 |
+
"seeded_questions": 30,
|
| 134 |
+
"swarm_enabled": true
|
| 135 |
+
},
|
| 136 |
+
"created_at": "2026-04-06T18:29:39+00:00",
|
| 137 |
+
"episodes": 30,
|
| 138 |
+
"metrics": {
|
| 139 |
+
"avg_compactness_reward": 0.0,
|
| 140 |
+
"avg_connectivity_gain_reward": 0.2000000000000001,
|
| 141 |
+
"avg_connectivity_reward": 0.12999999999999998,
|
| 142 |
+
"avg_diversity_reward": 0.12433333333333325,
|
| 143 |
+
"avg_entity_informativeness_reward": -0.02515191749984708,
|
| 144 |
+
"avg_format_reward": 0.15,
|
| 145 |
+
"avg_graph_f1": 0.2916528337385394,
|
| 146 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 147 |
+
"avg_knowledge_indexing_reward": 0.11539120363588044,
|
| 148 |
+
"avg_relation_informativeness_reward": 0.0769903534735767,
|
| 149 |
+
"avg_reward": 4.460667345528021,
|
| 150 |
+
"avg_soft_shaping_reward": 0.3,
|
| 151 |
+
"avg_spawn_count": 4.0,
|
| 152 |
+
"avg_spawn_critical_steps": 6.0,
|
| 153 |
+
"avg_steps_to_solution": 9.0,
|
| 154 |
+
"deanonymization_accuracy": 0.0,
|
| 155 |
+
"leaderboard_score": 0.6269168609961595,
|
| 156 |
+
"retrieval_signal": 0.7153869212725582,
|
| 157 |
+
"spawn_completion_rate": 1.0,
|
| 158 |
+
"spawn_signal": 0.6666666666666666,
|
| 159 |
+
"structural_signal": 0.5815176871947458,
|
| 160 |
+
"task_success_rate": 1.0,
|
| 161 |
+
"tool_efficiency": 0.5
|
| 162 |
+
},
|
| 163 |
+
"run_id": "run_0004",
|
| 164 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"config": {
|
| 168 |
+
"max_agents": 3,
|
| 169 |
+
"max_breadth": 2,
|
| 170 |
+
"max_depth": 2,
|
| 171 |
+
"max_steps": 24,
|
| 172 |
+
"max_width": 2,
|
| 173 |
+
"seed": 2026,
|
| 174 |
+
"seeded_questions": 30,
|
| 175 |
+
"swarm_enabled": true
|
| 176 |
+
},
|
| 177 |
+
"created_at": "2026-04-06T18:33:06+00:00",
|
| 178 |
+
"episodes": 2,
|
| 179 |
+
"metrics": {
|
| 180 |
+
"avg_compactness_reward": 0.0,
|
| 181 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 182 |
+
"avg_connectivity_reward": -0.15,
|
| 183 |
+
"avg_diversity_reward": 0.13833333333333334,
|
| 184 |
+
"avg_entity_informativeness_reward": -0.026628229842114173,
|
| 185 |
+
"avg_format_reward": 0.15,
|
| 186 |
+
"avg_graph_f1": 0.6190476190476191,
|
| 187 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 188 |
+
"avg_knowledge_indexing_reward": 0.10681818181818181,
|
| 189 |
+
"avg_relation_informativeness_reward": 0.048120982127120335,
|
| 190 |
+
"avg_reward": 4.334953339016039,
|
| 191 |
+
"avg_soft_shaping_reward": 0.3,
|
| 192 |
+
"avg_spawn_count": 4.0,
|
| 193 |
+
"avg_spawn_critical_steps": 6.0,
|
| 194 |
+
"avg_steps_to_solution": 9.0,
|
| 195 |
+
"deanonymization_accuracy": 0.0,
|
| 196 |
+
"leaderboard_score": 0.685242999396977,
|
| 197 |
+
"retrieval_signal": 0.7123863636363637,
|
| 198 |
+
"spawn_completion_rate": 1.0,
|
| 199 |
+
"spawn_signal": 0.6666666666666666,
|
| 200 |
+
"structural_signal": 0.5075485504570012,
|
| 201 |
+
"task_success_rate": 1.0,
|
| 202 |
+
"tool_efficiency": 0.5
|
| 203 |
+
},
|
| 204 |
+
"run_id": "run_0005",
|
| 205 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"config": {
|
| 209 |
+
"max_agents": 1,
|
| 210 |
+
"max_breadth": 2,
|
| 211 |
+
"max_depth": 2,
|
| 212 |
+
"max_steps": 24,
|
| 213 |
+
"max_width": 2,
|
| 214 |
+
"seed": 2026,
|
| 215 |
+
"seeded_questions": 30,
|
| 216 |
+
"swarm_enabled": true
|
| 217 |
+
},
|
| 218 |
+
"created_at": "2026-04-06T18:54:52+00:00",
|
| 219 |
+
"episodes": 1,
|
| 220 |
+
"metrics": {
|
| 221 |
+
"avg_compactness_reward": 0.0,
|
| 222 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 223 |
+
"avg_connectivity_reward": -0.3,
|
| 224 |
+
"avg_diversity_reward": 0.08,
|
| 225 |
+
"avg_entity_informativeness_reward": -0.02450859227728558,
|
| 226 |
+
"avg_format_reward": 0.15,
|
| 227 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 228 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 229 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 230 |
+
"avg_relation_informativeness_reward": 0.04353540016904645,
|
| 231 |
+
"avg_reward": 3.037246438342494,
|
| 232 |
+
"avg_soft_shaping_reward": 0.15,
|
| 233 |
+
"avg_spawn_count": 2.0,
|
| 234 |
+
"avg_spawn_critical_steps": 6.0,
|
| 235 |
+
"avg_steps_to_solution": 5.0,
|
| 236 |
+
"deanonymization_accuracy": 0.0,
|
| 237 |
+
"leaderboard_score": 0.6201263424948862,
|
| 238 |
+
"retrieval_signal": 0.7036363636363637,
|
| 239 |
+
"spawn_completion_rate": 1.0,
|
| 240 |
+
"spawn_signal": 0.6666666666666666,
|
| 241 |
+
"structural_signal": 0.45080536157835216,
|
| 242 |
+
"task_success_rate": 1.0,
|
| 243 |
+
"tool_efficiency": 0.5
|
| 244 |
+
},
|
| 245 |
+
"run_id": "run_0006",
|
| 246 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"config": {
|
| 250 |
+
"max_agents": 1,
|
| 251 |
+
"max_breadth": 2,
|
| 252 |
+
"max_depth": 2,
|
| 253 |
+
"max_steps": 24,
|
| 254 |
+
"max_width": 2,
|
| 255 |
+
"seed": 2026,
|
| 256 |
+
"seeded_questions": 30,
|
| 257 |
+
"swarm_enabled": true
|
| 258 |
+
},
|
| 259 |
+
"created_at": "2026-04-06T19:22:57+00:00",
|
| 260 |
+
"episodes": 1,
|
| 261 |
+
"metrics": {
|
| 262 |
+
"avg_compactness_reward": 0.0,
|
| 263 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 264 |
+
"avg_connectivity_reward": -0.3,
|
| 265 |
+
"avg_diversity_reward": 0.08,
|
| 266 |
+
"avg_entity_informativeness_reward": -0.005263146336646693,
|
| 267 |
+
"avg_format_reward": 0.15,
|
| 268 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 269 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 270 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 271 |
+
"avg_relation_informativeness_reward": 0.044276243254877785,
|
| 272 |
+
"avg_reward": 3.057232727368964,
|
| 273 |
+
"avg_soft_shaping_reward": 0.15,
|
| 274 |
+
"avg_spawn_count": 2.0,
|
| 275 |
+
"avg_spawn_critical_steps": 6.0,
|
| 276 |
+
"avg_steps_to_solution": 5.0,
|
| 277 |
+
"deanonymization_accuracy": 0.0,
|
| 278 |
+
"leaderboard_score": 0.6205293479318178,
|
| 279 |
+
"retrieval_signal": 0.7036363636363637,
|
| 280 |
+
"spawn_completion_rate": 1.0,
|
| 281 |
+
"spawn_signal": 0.6666666666666666,
|
| 282 |
+
"structural_signal": 0.4548026193836462,
|
| 283 |
+
"task_success_rate": 1.0,
|
| 284 |
+
"tool_efficiency": 0.5
|
| 285 |
+
},
|
| 286 |
+
"run_id": "run_0007",
|
| 287 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"config": {
|
| 291 |
+
"llm_model": "qwen3:1.7b",
|
| 292 |
+
"llm_provider": "ollama",
|
| 293 |
+
"max_agents": 1,
|
| 294 |
+
"max_breadth": 2,
|
| 295 |
+
"max_depth": 2,
|
| 296 |
+
"max_steps": 24,
|
| 297 |
+
"max_width": 2,
|
| 298 |
+
"seed": 2026,
|
| 299 |
+
"seeded_questions": 30,
|
| 300 |
+
"swarm_enabled": true
|
| 301 |
+
},
|
| 302 |
+
"created_at": "2026-04-06T19:48:33+00:00",
|
| 303 |
+
"episodes": 3,
|
| 304 |
+
"metrics": {
|
| 305 |
+
"avg_compactness_reward": 0.0,
|
| 306 |
+
"avg_connectivity_gain_reward": 0.10000000000000002,
|
| 307 |
+
"avg_connectivity_reward": -0.09999999999999999,
|
| 308 |
+
"avg_diversity_reward": 0.08,
|
| 309 |
+
"avg_entity_informativeness_reward": -0.028683816517602444,
|
| 310 |
+
"avg_format_reward": 0.15,
|
| 311 |
+
"avg_graph_f1": 0.15537340619307835,
|
| 312 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 313 |
+
"avg_knowledge_indexing_reward": 0.07932190760059611,
|
| 314 |
+
"avg_relation_informativeness_reward": 0.044225025032092045,
|
| 315 |
+
"avg_reward": 3.1324990406542437,
|
| 316 |
+
"avg_soft_shaping_reward": 0.15,
|
| 317 |
+
"avg_spawn_count": 2.0,
|
| 318 |
+
"avg_spawn_critical_steps": 6.0,
|
| 319 |
+
"avg_steps_to_solution": 5.0,
|
| 320 |
+
"deanonymization_accuracy": 0.0,
|
| 321 |
+
"leaderboard_score": 0.5890485416309927,
|
| 322 |
+
"retrieval_signal": 0.7027626676602087,
|
| 323 |
+
"spawn_completion_rate": 1.0,
|
| 324 |
+
"spawn_signal": 0.6666666666666666,
|
| 325 |
+
"structural_signal": 0.5001082417028979,
|
| 326 |
+
"task_success_rate": 1.0,
|
| 327 |
+
"tool_efficiency": 0.5
|
| 328 |
+
},
|
| 329 |
+
"run_id": "run_0008",
|
| 330 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"config": {
|
| 334 |
+
"llm_model": "qwen3:1.7b",
|
| 335 |
+
"llm_provider": "ollama",
|
| 336 |
+
"max_agents": 1,
|
| 337 |
+
"max_breadth": 2,
|
| 338 |
+
"max_depth": 2,
|
| 339 |
+
"max_steps": 24,
|
| 340 |
+
"max_width": 2,
|
| 341 |
+
"seed": 2026,
|
| 342 |
+
"seeded_questions": 30,
|
| 343 |
+
"swarm_enabled": true
|
| 344 |
+
},
|
| 345 |
+
"created_at": "2026-04-06T19:55:08+00:00",
|
| 346 |
+
"episodes": 1,
|
| 347 |
+
"metrics": {
|
| 348 |
+
"avg_compactness_reward": 0.0,
|
| 349 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 350 |
+
"avg_connectivity_reward": -0.3,
|
| 351 |
+
"avg_diversity_reward": 0.08,
|
| 352 |
+
"avg_entity_informativeness_reward": -0.005263146336646693,
|
| 353 |
+
"avg_format_reward": 0.15,
|
| 354 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 355 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 356 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 357 |
+
"avg_relation_informativeness_reward": 0.04406984773661544,
|
| 358 |
+
"avg_reward": 3.0570263318507016,
|
| 359 |
+
"avg_soft_shaping_reward": 0.15,
|
| 360 |
+
"avg_spawn_count": 2.0,
|
| 361 |
+
"avg_spawn_critical_steps": 6.0,
|
| 362 |
+
"avg_steps_to_solution": 5.0,
|
| 363 |
+
"deanonymization_accuracy": 0.0,
|
| 364 |
+
"leaderboard_score": 0.6205251901591228,
|
| 365 |
+
"retrieval_signal": 0.7036363636363637,
|
| 366 |
+
"spawn_completion_rate": 1.0,
|
| 367 |
+
"spawn_signal": 0.6666666666666666,
|
| 368 |
+
"structural_signal": 0.45476134027999376,
|
| 369 |
+
"task_success_rate": 1.0,
|
| 370 |
+
"tool_efficiency": 0.5
|
| 371 |
+
},
|
| 372 |
+
"run_id": "run_0009",
|
| 373 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 374 |
+
},
|
| 375 |
+
{
|
| 376 |
+
"config": {
|
| 377 |
+
"llm_model": "qwen3:1.7b",
|
| 378 |
+
"llm_provider": "ollama",
|
| 379 |
+
"max_agents": 1,
|
| 380 |
+
"max_breadth": 2,
|
| 381 |
+
"max_depth": 2,
|
| 382 |
+
"max_steps": 24,
|
| 383 |
+
"max_width": 2,
|
| 384 |
+
"seed": 2026,
|
| 385 |
+
"seeded_questions": 30,
|
| 386 |
+
"swarm_enabled": true
|
| 387 |
+
},
|
| 388 |
+
"created_at": "2026-04-06T20:01:34+00:00",
|
| 389 |
+
"episodes": 1,
|
| 390 |
+
"metrics": {
|
| 391 |
+
"avg_compactness_reward": 0.0,
|
| 392 |
+
"avg_connectivity_gain_reward": 0.1,
|
| 393 |
+
"avg_connectivity_reward": -0.3,
|
| 394 |
+
"avg_diversity_reward": 0.08,
|
| 395 |
+
"avg_entity_informativeness_reward": -0.020826953461399098,
|
| 396 |
+
"avg_format_reward": 0.15,
|
| 397 |
+
"avg_graph_f1": 0.33333333333333337,
|
| 398 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 399 |
+
"avg_knowledge_indexing_reward": 0.08181818181818182,
|
| 400 |
+
"avg_relation_informativeness_reward": 0.04348043923536236,
|
| 401 |
+
"avg_reward": 3.040873116224696,
|
| 402 |
+
"avg_soft_shaping_reward": 0.15,
|
| 403 |
+
"avg_spawn_count": 2.0,
|
| 404 |
+
"avg_spawn_critical_steps": 6.0,
|
| 405 |
+
"avg_steps_to_solution": 5.0,
|
| 406 |
+
"deanonymization_accuracy": 0.0,
|
| 407 |
+
"leaderboard_score": 0.6201995296517067,
|
| 408 |
+
"retrieval_signal": 0.7036363636363637,
|
| 409 |
+
"spawn_completion_rate": 1.0,
|
| 410 |
+
"spawn_signal": 0.6666666666666666,
|
| 411 |
+
"structural_signal": 0.45153069715479266,
|
| 412 |
+
"task_success_rate": 1.0,
|
| 413 |
+
"tool_efficiency": 0.5
|
| 414 |
+
},
|
| 415 |
+
"run_id": "run_0010",
|
| 416 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"config": {
|
| 420 |
+
"max_agents": 3,
|
| 421 |
+
"max_breadth": 2,
|
| 422 |
+
"max_depth": 2,
|
| 423 |
+
"max_steps": 24,
|
| 424 |
+
"max_width": 2,
|
| 425 |
+
"seed": 2026,
|
| 426 |
+
"seeded_questions": 30,
|
| 427 |
+
"swarm_enabled": true
|
| 428 |
+
},
|
| 429 |
+
"created_at": "2026-04-06T20:46:11+00:00",
|
| 430 |
+
"episodes": 1,
|
| 431 |
+
"metrics": {
|
| 432 |
+
"avg_compactness_reward": 0.0,
|
| 433 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 434 |
+
"avg_connectivity_reward": -0.15,
|
| 435 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 436 |
+
"avg_entity_informativeness_reward": 0.019629386278697845,
|
| 437 |
+
"avg_format_reward": 0.15,
|
| 438 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 439 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 440 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 441 |
+
"avg_relation_informativeness_reward": 0.08347928023822283,
|
| 442 |
+
"avg_reward": 1.829702015111513,
|
| 443 |
+
"avg_soft_shaping_reward": 0.3,
|
| 444 |
+
"avg_spawn_count": 4.0,
|
| 445 |
+
"avg_spawn_critical_steps": 6.0,
|
| 446 |
+
"avg_steps_to_solution": 9.0,
|
| 447 |
+
"deanonymization_accuracy": 0.0,
|
| 448 |
+
"leaderboard_score": 0.6715432845394145,
|
| 449 |
+
"retrieval_signal": 0.7179545454545455,
|
| 450 |
+
"spawn_completion_rate": 1.0,
|
| 451 |
+
"spawn_signal": 0.6666666666666666,
|
| 452 |
+
"structural_signal": 0.5221217333033842,
|
| 453 |
+
"task_success_rate": 1.0,
|
| 454 |
+
"tool_efficiency": 0.5
|
| 455 |
+
},
|
| 456 |
+
"run_id": "run_0011",
|
| 457 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"config": {
|
| 461 |
+
"max_agents": 3,
|
| 462 |
+
"max_breadth": 2,
|
| 463 |
+
"max_depth": 2,
|
| 464 |
+
"max_steps": 24,
|
| 465 |
+
"max_width": 2,
|
| 466 |
+
"seed": 2026,
|
| 467 |
+
"seeded_questions": 30,
|
| 468 |
+
"swarm_enabled": true
|
| 469 |
+
},
|
| 470 |
+
"created_at": "2026-04-06T20:49:44+00:00",
|
| 471 |
+
"episodes": 1,
|
| 472 |
+
"metrics": {
|
| 473 |
+
"avg_compactness_reward": 0.0,
|
| 474 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 475 |
+
"avg_connectivity_reward": -0.15,
|
| 476 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 477 |
+
"avg_entity_informativeness_reward": 0.019629386278697845,
|
| 478 |
+
"avg_format_reward": 0.15,
|
| 479 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 480 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 481 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 482 |
+
"avg_relation_informativeness_reward": 0.08335372627068136,
|
| 483 |
+
"avg_reward": 0.7139904233885594,
|
| 484 |
+
"avg_soft_shaping_reward": 0.3,
|
| 485 |
+
"avg_spawn_count": 4.0,
|
| 486 |
+
"avg_spawn_critical_steps": 6.0,
|
| 487 |
+
"avg_steps_to_solution": 9.0,
|
| 488 |
+
"deanonymization_accuracy": 0.0,
|
| 489 |
+
"leaderboard_score": 0.6641542345113342,
|
| 490 |
+
"retrieval_signal": 0.7179545454545455,
|
| 491 |
+
"spawn_completion_rate": 1.0,
|
| 492 |
+
"spawn_signal": 0.6666666666666666,
|
| 493 |
+
"structural_signal": 0.5220966225098759,
|
| 494 |
+
"task_success_rate": 1.0,
|
| 495 |
+
"tool_efficiency": 0.5
|
| 496 |
+
},
|
| 497 |
+
"run_id": "run_0012",
|
| 498 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"config": {
|
| 502 |
+
"max_agents": 3,
|
| 503 |
+
"max_breadth": 2,
|
| 504 |
+
"max_depth": 2,
|
| 505 |
+
"max_steps": 24,
|
| 506 |
+
"max_width": 2,
|
| 507 |
+
"seed": 2026,
|
| 508 |
+
"seeded_questions": 30,
|
| 509 |
+
"swarm_enabled": true
|
| 510 |
+
},
|
| 511 |
+
"created_at": "2026-04-06T20:59:43+00:00",
|
| 512 |
+
"episodes": 1,
|
| 513 |
+
"metrics": {
|
| 514 |
+
"avg_compactness_reward": 0.0,
|
| 515 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 516 |
+
"avg_connectivity_reward": -0.15,
|
| 517 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 518 |
+
"avg_entity_informativeness_reward": 0.0036675120354726642,
|
| 519 |
+
"avg_format_reward": 0.15,
|
| 520 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 521 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 522 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 523 |
+
"avg_relation_informativeness_reward": 0.08250745620050208,
|
| 524 |
+
"avg_reward": 0.7138056720677886,
|
| 525 |
+
"avg_soft_shaping_reward": 0.3,
|
| 526 |
+
"avg_spawn_count": 4.0,
|
| 527 |
+
"avg_spawn_critical_steps": 6.0,
|
| 528 |
+
"avg_steps_to_solution": 9.0,
|
| 529 |
+
"deanonymization_accuracy": 0.0,
|
| 530 |
+
"leaderboard_score": 0.6638424503476543,
|
| 531 |
+
"retrieval_signal": 0.7179545454545455,
|
| 532 |
+
"spawn_completion_rate": 1.0,
|
| 533 |
+
"spawn_signal": 0.6666666666666666,
|
| 534 |
+
"structural_signal": 0.518734993647195,
|
| 535 |
+
"task_success_rate": 1.0,
|
| 536 |
+
"tool_efficiency": 0.5
|
| 537 |
+
},
|
| 538 |
+
"run_id": "run_0013",
|
| 539 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 540 |
+
},
|
| 541 |
+
{
|
| 542 |
+
"config": {
|
| 543 |
+
"llm_model": "gpt-5.4-mini",
|
| 544 |
+
"llm_provider": "openai",
|
| 545 |
+
"max_agents": 3,
|
| 546 |
+
"max_breadth": 2,
|
| 547 |
+
"max_depth": 2,
|
| 548 |
+
"max_steps": 24,
|
| 549 |
+
"max_width": 2,
|
| 550 |
+
"seed": 2026,
|
| 551 |
+
"seeded_questions": 30,
|
| 552 |
+
"swarm_enabled": true
|
| 553 |
+
},
|
| 554 |
+
"created_at": "2026-04-07T09:44:40+00:00",
|
| 555 |
+
"episodes": 1,
|
| 556 |
+
"metrics": {
|
| 557 |
+
"avg_compactness_reward": 0.0,
|
| 558 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 559 |
+
"avg_connectivity_reward": -0.15,
|
| 560 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 561 |
+
"avg_entity_informativeness_reward": -0.018704290877944903,
|
| 562 |
+
"avg_format_reward": 0.15,
|
| 563 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 564 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 565 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 566 |
+
"avg_relation_informativeness_reward": 0.08056039127695382,
|
| 567 |
+
"avg_reward": 0.7135379106634446,
|
| 568 |
+
"avg_soft_shaping_reward": 0.3,
|
| 569 |
+
"avg_spawn_count": 4.0,
|
| 570 |
+
"avg_spawn_critical_steps": 6.0,
|
| 571 |
+
"avg_steps_to_solution": 9.0,
|
| 572 |
+
"deanonymization_accuracy": 0.0,
|
| 573 |
+
"leaderboard_score": 0.6633913226563717,
|
| 574 |
+
"retrieval_signal": 0.7179545454545455,
|
| 575 |
+
"spawn_completion_rate": 1.0,
|
| 576 |
+
"spawn_signal": 0.6666666666666666,
|
| 577 |
+
"structural_signal": 0.5138712200798018,
|
| 578 |
+
"task_success_rate": 1.0,
|
| 579 |
+
"tool_efficiency": 0.5
|
| 580 |
+
},
|
| 581 |
+
"run_id": "run_0014",
|
| 582 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 583 |
+
},
|
| 584 |
+
{
|
| 585 |
+
"config": {
|
| 586 |
+
"llm_model": "gpt-5.4-mini",
|
| 587 |
+
"llm_provider": "openai",
|
| 588 |
+
"max_agents": 3,
|
| 589 |
+
"max_breadth": 2,
|
| 590 |
+
"max_depth": 2,
|
| 591 |
+
"max_steps": 24,
|
| 592 |
+
"max_width": 2,
|
| 593 |
+
"seed": 2026,
|
| 594 |
+
"seeded_questions": 30,
|
| 595 |
+
"swarm_enabled": true
|
| 596 |
+
},
|
| 597 |
+
"created_at": "2026-04-07T09:55:19+00:00",
|
| 598 |
+
"episodes": 1,
|
| 599 |
+
"metrics": {
|
| 600 |
+
"avg_compactness_reward": 0.0,
|
| 601 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 602 |
+
"avg_connectivity_reward": -0.15,
|
| 603 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 604 |
+
"avg_entity_informativeness_reward": -0.018704290877944903,
|
| 605 |
+
"avg_format_reward": 0.15,
|
| 606 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 607 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 608 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 609 |
+
"avg_relation_informativeness_reward": 0.08056039127695382,
|
| 610 |
+
"avg_reward": 0.7135379106634446,
|
| 611 |
+
"avg_soft_shaping_reward": 0.3,
|
| 612 |
+
"avg_spawn_count": 4.0,
|
| 613 |
+
"avg_spawn_critical_steps": 6.0,
|
| 614 |
+
"avg_steps_to_solution": 9.0,
|
| 615 |
+
"deanonymization_accuracy": 0.0,
|
| 616 |
+
"leaderboard_score": 0.6633913226563717,
|
| 617 |
+
"retrieval_signal": 0.7179545454545455,
|
| 618 |
+
"spawn_completion_rate": 1.0,
|
| 619 |
+
"spawn_signal": 0.6666666666666666,
|
| 620 |
+
"structural_signal": 0.5138712200798018,
|
| 621 |
+
"task_success_rate": 1.0,
|
| 622 |
+
"tool_efficiency": 0.5
|
| 623 |
+
},
|
| 624 |
+
"run_id": "run_0015",
|
| 625 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"config": {
|
| 629 |
+
"llm_model": "gpt-5.4-mini",
|
| 630 |
+
"llm_provider": "openai",
|
| 631 |
+
"max_agents": 3,
|
| 632 |
+
"max_breadth": 2,
|
| 633 |
+
"max_depth": 2,
|
| 634 |
+
"max_steps": 24,
|
| 635 |
+
"max_width": 2,
|
| 636 |
+
"seed": 2026,
|
| 637 |
+
"seeded_questions": 30,
|
| 638 |
+
"swarm_enabled": true
|
| 639 |
+
},
|
| 640 |
+
"created_at": "2026-04-07T09:56:28+00:00",
|
| 641 |
+
"episodes": 30,
|
| 642 |
+
"metrics": {
|
| 643 |
+
"avg_compactness_reward": 0.0,
|
| 644 |
+
"avg_connectivity_gain_reward": 0.2000000000000001,
|
| 645 |
+
"avg_connectivity_reward": 0.12999999999999998,
|
| 646 |
+
"avg_diversity_reward": 0.12433333333333325,
|
| 647 |
+
"avg_entity_informativeness_reward": -0.02515191749984708,
|
| 648 |
+
"avg_format_reward": 0.15,
|
| 649 |
+
"avg_graph_f1": 0.2916528337385394,
|
| 650 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 651 |
+
"avg_knowledge_indexing_reward": 0.11539120363588044,
|
| 652 |
+
"avg_relation_informativeness_reward": 0.0769903534735767,
|
| 653 |
+
"avg_reward": 0.7150555461096118,
|
| 654 |
+
"avg_soft_shaping_reward": 0.3,
|
| 655 |
+
"avg_spawn_count": 4.0,
|
| 656 |
+
"avg_spawn_critical_steps": 6.0,
|
| 657 |
+
"avg_steps_to_solution": 9.0,
|
| 658 |
+
"deanonymization_accuracy": 0.0,
|
| 659 |
+
"leaderboard_score": 0.6132407715455404,
|
| 660 |
+
"retrieval_signal": 0.7153869212725582,
|
| 661 |
+
"spawn_completion_rate": 1.0,
|
| 662 |
+
"spawn_signal": 0.6666666666666666,
|
| 663 |
+
"structural_signal": 0.5815176871947458,
|
| 664 |
+
"task_success_rate": 1.0,
|
| 665 |
+
"tool_efficiency": 0.5
|
| 666 |
+
},
|
| 667 |
+
"run_id": "run_0016",
|
| 668 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"config": {
|
| 672 |
+
"llm_model": "gpt-5.4-mini",
|
| 673 |
+
"llm_provider": "openai",
|
| 674 |
+
"max_agents": 3,
|
| 675 |
+
"max_breadth": 2,
|
| 676 |
+
"max_depth": 2,
|
| 677 |
+
"max_steps": 24,
|
| 678 |
+
"max_width": 2,
|
| 679 |
+
"seed": 2026,
|
| 680 |
+
"seeded_questions": 30,
|
| 681 |
+
"swarm_enabled": true
|
| 682 |
+
},
|
| 683 |
+
"created_at": "2026-04-07T10:02:32+00:00",
|
| 684 |
+
"episodes": 1,
|
| 685 |
+
"metrics": {
|
| 686 |
+
"avg_compactness_reward": 0.0,
|
| 687 |
+
"avg_connectivity_gain_reward": 0.2,
|
| 688 |
+
"avg_connectivity_reward": -0.15,
|
| 689 |
+
"avg_diversity_reward": 0.12666666666666665,
|
| 690 |
+
"avg_entity_informativeness_reward": -0.018704290877944903,
|
| 691 |
+
"avg_format_reward": 0.15,
|
| 692 |
+
"avg_graph_f1": 0.5714285714285715,
|
| 693 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 694 |
+
"avg_knowledge_indexing_reward": 0.12272727272727273,
|
| 695 |
+
"avg_relation_informativeness_reward": 0.08056039127695382,
|
| 696 |
+
"avg_reward": 0.7135379106634446,
|
| 697 |
+
"avg_soft_shaping_reward": 0.3,
|
| 698 |
+
"avg_spawn_count": 4.0,
|
| 699 |
+
"avg_spawn_critical_steps": 6.0,
|
| 700 |
+
"avg_steps_to_solution": 9.0,
|
| 701 |
+
"deanonymization_accuracy": 0.0,
|
| 702 |
+
"leaderboard_score": 0.6633913226563717,
|
| 703 |
+
"retrieval_signal": 0.7179545454545455,
|
| 704 |
+
"spawn_completion_rate": 1.0,
|
| 705 |
+
"spawn_signal": 0.6666666666666666,
|
| 706 |
+
"structural_signal": 0.5138712200798018,
|
| 707 |
+
"task_success_rate": 1.0,
|
| 708 |
+
"tool_efficiency": 0.5
|
| 709 |
+
},
|
| 710 |
+
"run_id": "run_0017",
|
| 711 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 712 |
+
},
|
| 713 |
+
{
|
| 714 |
+
"config": {
|
| 715 |
+
"llm_model": "gpt-5.4-mini",
|
| 716 |
+
"llm_provider": "openai",
|
| 717 |
+
"max_agents": 3,
|
| 718 |
+
"max_breadth": 2,
|
| 719 |
+
"max_depth": 2,
|
| 720 |
+
"max_steps": 24,
|
| 721 |
+
"max_width": 2,
|
| 722 |
+
"seed": 2026,
|
| 723 |
+
"seeded_questions": 30,
|
| 724 |
+
"swarm_enabled": true
|
| 725 |
+
},
|
| 726 |
+
"created_at": "2026-04-07T10:02:49+00:00",
|
| 727 |
+
"episodes": 3,
|
| 728 |
+
"metrics": {
|
| 729 |
+
"avg_compactness_reward": 0.0,
|
| 730 |
+
"avg_connectivity_gain_reward": 0.20000000000000004,
|
| 731 |
+
"avg_connectivity_reward": -0.06666666666666667,
|
| 732 |
+
"avg_diversity_reward": 0.13444444444444445,
|
| 733 |
+
"avg_entity_informativeness_reward": -0.029992009599206938,
|
| 734 |
+
"avg_format_reward": 0.15,
|
| 735 |
+
"avg_graph_f1": 0.5793650793650794,
|
| 736 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 737 |
+
"avg_knowledge_indexing_reward": 0.10372960372960373,
|
| 738 |
+
"avg_relation_informativeness_reward": 0.06898843512226,
|
| 739 |
+
"avg_reward": 0.7133699465240085,
|
| 740 |
+
"avg_soft_shaping_reward": 0.3,
|
| 741 |
+
"avg_spawn_count": 4.0,
|
| 742 |
+
"avg_spawn_critical_steps": 6.0,
|
| 743 |
+
"avg_steps_to_solution": 9.0,
|
| 744 |
+
"deanonymization_accuracy": 0.0,
|
| 745 |
+
"leaderboard_score": 0.6656078661080486,
|
| 746 |
+
"retrieval_signal": 0.7113053613053614,
|
| 747 |
+
"spawn_completion_rate": 1.0,
|
| 748 |
+
"spawn_signal": 0.6666666666666666,
|
| 749 |
+
"structural_signal": 0.5312992851046106,
|
| 750 |
+
"task_success_rate": 1.0,
|
| 751 |
+
"tool_efficiency": 0.5
|
| 752 |
+
},
|
| 753 |
+
"run_id": "run_0018",
|
| 754 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"config": {
|
| 758 |
+
"llm_model": "gpt-5.4-mini",
|
| 759 |
+
"llm_provider": "openai",
|
| 760 |
+
"max_agents": 3,
|
| 761 |
+
"max_breadth": 2,
|
| 762 |
+
"max_depth": 2,
|
| 763 |
+
"max_steps": 24,
|
| 764 |
+
"max_width": 2,
|
| 765 |
+
"seed": 2026,
|
| 766 |
+
"seeded_questions": 30,
|
| 767 |
+
"swarm_enabled": true
|
| 768 |
+
},
|
| 769 |
+
"created_at": "2026-04-07T10:04:53+00:00",
|
| 770 |
+
"episodes": 3,
|
| 771 |
+
"metrics": {
|
| 772 |
+
"avg_compactness_reward": 0.0,
|
| 773 |
+
"avg_connectivity_gain_reward": 0.20000000000000004,
|
| 774 |
+
"avg_connectivity_reward": -0.06666666666666667,
|
| 775 |
+
"avg_diversity_reward": 0.13444444444444445,
|
| 776 |
+
"avg_entity_informativeness_reward": -0.029992009599206938,
|
| 777 |
+
"avg_format_reward": 0.15,
|
| 778 |
+
"avg_graph_f1": 0.5793650793650794,
|
| 779 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 780 |
+
"avg_knowledge_indexing_reward": 0.10372960372960373,
|
| 781 |
+
"avg_relation_informativeness_reward": 0.06898843512226,
|
| 782 |
+
"avg_reward": 0.7133699465240085,
|
| 783 |
+
"avg_soft_shaping_reward": 0.3,
|
| 784 |
+
"avg_spawn_count": 4.0,
|
| 785 |
+
"avg_spawn_critical_steps": 6.0,
|
| 786 |
+
"avg_steps_to_solution": 9.0,
|
| 787 |
+
"deanonymization_accuracy": 0.0,
|
| 788 |
+
"leaderboard_score": 0.6656078661080486,
|
| 789 |
+
"retrieval_signal": 0.7113053613053614,
|
| 790 |
+
"spawn_completion_rate": 1.0,
|
| 791 |
+
"spawn_signal": 0.6666666666666666,
|
| 792 |
+
"structural_signal": 0.5312992851046106,
|
| 793 |
+
"task_success_rate": 1.0,
|
| 794 |
+
"tool_efficiency": 0.5
|
| 795 |
+
},
|
| 796 |
+
"run_id": "run_0019",
|
| 797 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 798 |
+
},
|
| 799 |
+
{
|
| 800 |
+
"config": {
|
| 801 |
+
"llm_model": "gpt-5.4-mini",
|
| 802 |
+
"llm_provider": "openai",
|
| 803 |
+
"max_agents": 3,
|
| 804 |
+
"max_breadth": 2,
|
| 805 |
+
"max_depth": 2,
|
| 806 |
+
"max_steps": 24,
|
| 807 |
+
"max_width": 2,
|
| 808 |
+
"seed": 2026,
|
| 809 |
+
"seeded_questions": 30,
|
| 810 |
+
"swarm_enabled": true
|
| 811 |
+
},
|
| 812 |
+
"created_at": "2026-04-07T10:11:34+00:00",
|
| 813 |
+
"episodes": 3,
|
| 814 |
+
"metrics": {
|
| 815 |
+
"avg_compactness_reward": 0.0,
|
| 816 |
+
"avg_connectivity_gain_reward": 0.20000000000000004,
|
| 817 |
+
"avg_connectivity_reward": -0.06666666666666667,
|
| 818 |
+
"avg_diversity_reward": 0.13444444444444445,
|
| 819 |
+
"avg_entity_informativeness_reward": -0.029992009599206938,
|
| 820 |
+
"avg_format_reward": 0.15,
|
| 821 |
+
"avg_graph_f1": 0.5793650793650794,
|
| 822 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 823 |
+
"avg_knowledge_indexing_reward": 0.10372960372960373,
|
| 824 |
+
"avg_relation_informativeness_reward": 0.06898843512226,
|
| 825 |
+
"avg_reward": 0.7133699465240085,
|
| 826 |
+
"avg_soft_shaping_reward": 0.3,
|
| 827 |
+
"avg_spawn_count": 4.0,
|
| 828 |
+
"avg_spawn_critical_steps": 6.0,
|
| 829 |
+
"avg_steps_to_solution": 9.0,
|
| 830 |
+
"deanonymization_accuracy": 0.0,
|
| 831 |
+
"leaderboard_score": 0.6656078661080486,
|
| 832 |
+
"retrieval_signal": 0.7113053613053614,
|
| 833 |
+
"spawn_completion_rate": 1.0,
|
| 834 |
+
"spawn_signal": 0.6666666666666666,
|
| 835 |
+
"structural_signal": 0.5312992851046106,
|
| 836 |
+
"task_success_rate": 1.0,
|
| 837 |
+
"tool_efficiency": 0.5
|
| 838 |
+
},
|
| 839 |
+
"run_id": "run_0020",
|
| 840 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"config": {
|
| 844 |
+
"llm_model": "gpt-5.4-mini",
|
| 845 |
+
"llm_provider": "openai",
|
| 846 |
+
"max_agents": 3,
|
| 847 |
+
"max_breadth": 2,
|
| 848 |
+
"max_depth": 2,
|
| 849 |
+
"max_steps": 24,
|
| 850 |
+
"max_width": 2,
|
| 851 |
+
"seed": 2026,
|
| 852 |
+
"seeded_questions": 30,
|
| 853 |
+
"swarm_enabled": true
|
| 854 |
+
},
|
| 855 |
+
"created_at": "2026-04-07T10:29:54+00:00",
|
| 856 |
+
"episodes": 3,
|
| 857 |
+
"metrics": {
|
| 858 |
+
"avg_compactness_reward": 0.0,
|
| 859 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 860 |
+
"avg_connectivity_reward": 0.0,
|
| 861 |
+
"avg_diversity_reward": 0.0,
|
| 862 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 863 |
+
"avg_format_reward": 0.15,
|
| 864 |
+
"avg_graph_f1": 0.0,
|
| 865 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 866 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 867 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 868 |
+
"avg_reward": 0.5519400198339021,
|
| 869 |
+
"avg_soft_shaping_reward": 0.0,
|
| 870 |
+
"avg_spawn_count": 0.0,
|
| 871 |
+
"avg_spawn_critical_steps": 0.0,
|
| 872 |
+
"avg_steps_to_solution": 1.0,
|
| 873 |
+
"deanonymization_accuracy": 0.0,
|
| 874 |
+
"leaderboard_score": 0.2785970009916951,
|
| 875 |
+
"retrieval_signal": 0.5,
|
| 876 |
+
"spawn_completion_rate": 0.0,
|
| 877 |
+
"spawn_signal": 0.4,
|
| 878 |
+
"structural_signal": 0.5,
|
| 879 |
+
"task_success_rate": 0.0,
|
| 880 |
+
"tool_efficiency": 1.0
|
| 881 |
+
},
|
| 882 |
+
"run_id": "run_0021",
|
| 883 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"config": {
|
| 887 |
+
"llm_model": "gpt-5.4-mini",
|
| 888 |
+
"llm_provider": "openai",
|
| 889 |
+
"max_agents": 3,
|
| 890 |
+
"max_breadth": 2,
|
| 891 |
+
"max_depth": 2,
|
| 892 |
+
"max_steps": 24,
|
| 893 |
+
"max_width": 2,
|
| 894 |
+
"seed": 2026,
|
| 895 |
+
"seeded_questions": 30,
|
| 896 |
+
"swarm_enabled": true
|
| 897 |
+
},
|
| 898 |
+
"created_at": "2026-04-07T15:59:20+00:00",
|
| 899 |
+
"episodes": 1,
|
| 900 |
+
"metrics": {
|
| 901 |
+
"avg_compactness_reward": 0.0,
|
| 902 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 903 |
+
"avg_connectivity_reward": 0.0,
|
| 904 |
+
"avg_diversity_reward": 0.0,
|
| 905 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 906 |
+
"avg_format_reward": 0.15,
|
| 907 |
+
"avg_graph_f1": 0.0,
|
| 908 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 909 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 910 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 911 |
+
"avg_reward": 0.5519400198339021,
|
| 912 |
+
"avg_soft_shaping_reward": 0.0,
|
| 913 |
+
"avg_spawn_count": 0.0,
|
| 914 |
+
"avg_spawn_critical_steps": 0.0,
|
| 915 |
+
"avg_steps_to_solution": 1.0,
|
| 916 |
+
"deanonymization_accuracy": 0.0,
|
| 917 |
+
"leaderboard_score": 0.2785970009916951,
|
| 918 |
+
"retrieval_signal": 0.5,
|
| 919 |
+
"spawn_completion_rate": 0.0,
|
| 920 |
+
"spawn_signal": 0.4,
|
| 921 |
+
"structural_signal": 0.5,
|
| 922 |
+
"task_success_rate": 0.0,
|
| 923 |
+
"tool_efficiency": 1.0
|
| 924 |
+
},
|
| 925 |
+
"run_id": "run_0022",
|
| 926 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 927 |
+
},
|
| 928 |
+
{
|
| 929 |
+
"config": {
|
| 930 |
+
"llm_model": "gpt-5.4-mini",
|
| 931 |
+
"llm_provider": "openai",
|
| 932 |
+
"max_agents": 3,
|
| 933 |
+
"max_breadth": 2,
|
| 934 |
+
"max_depth": 2,
|
| 935 |
+
"max_steps": 24,
|
| 936 |
+
"max_width": 2,
|
| 937 |
+
"seed": 2026,
|
| 938 |
+
"seeded_questions": 30,
|
| 939 |
+
"swarm_enabled": true
|
| 940 |
+
},
|
| 941 |
+
"created_at": "2026-04-08T04:25:00+00:00",
|
| 942 |
+
"episodes": 1,
|
| 943 |
+
"metrics": {
|
| 944 |
+
"avg_compactness_reward": 0.0,
|
| 945 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 946 |
+
"avg_connectivity_reward": 0.0,
|
| 947 |
+
"avg_diversity_reward": 0.0,
|
| 948 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 949 |
+
"avg_format_reward": 0.15,
|
| 950 |
+
"avg_graph_f1": 0.0,
|
| 951 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 952 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 953 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 954 |
+
"avg_reward": 0.5519400198339021,
|
| 955 |
+
"avg_soft_shaping_reward": 0.0,
|
| 956 |
+
"avg_spawn_count": 0.0,
|
| 957 |
+
"avg_spawn_critical_steps": 0.0,
|
| 958 |
+
"avg_steps_to_solution": 1.0,
|
| 959 |
+
"deanonymization_accuracy": 0.0,
|
| 960 |
+
"leaderboard_score": 0.2785970009916951,
|
| 961 |
+
"retrieval_signal": 0.5,
|
| 962 |
+
"spawn_completion_rate": 0.0,
|
| 963 |
+
"spawn_signal": 0.4,
|
| 964 |
+
"structural_signal": 0.5,
|
| 965 |
+
"task_success_rate": 0.0,
|
| 966 |
+
"tool_efficiency": 1.0
|
| 967 |
+
},
|
| 968 |
+
"run_id": "run_0023",
|
| 969 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 970 |
+
},
|
| 971 |
+
{
|
| 972 |
+
"config": {
|
| 973 |
+
"llm_model": "gpt-5.4-mini",
|
| 974 |
+
"llm_provider": "openai",
|
| 975 |
+
"max_agents": 3,
|
| 976 |
+
"max_breadth": 2,
|
| 977 |
+
"max_depth": 2,
|
| 978 |
+
"max_steps": 24,
|
| 979 |
+
"max_width": 2,
|
| 980 |
+
"seed": 2026,
|
| 981 |
+
"seeded_questions": 30,
|
| 982 |
+
"swarm_enabled": true
|
| 983 |
+
},
|
| 984 |
+
"created_at": "2026-04-08T04:28:07+00:00",
|
| 985 |
+
"episodes": 1,
|
| 986 |
+
"metrics": {
|
| 987 |
+
"avg_compactness_reward": 0.0,
|
| 988 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 989 |
+
"avg_connectivity_reward": 0.0,
|
| 990 |
+
"avg_diversity_reward": 0.0,
|
| 991 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 992 |
+
"avg_format_reward": 0.15,
|
| 993 |
+
"avg_graph_f1": 0.0,
|
| 994 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 995 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 996 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 997 |
+
"avg_reward": 0.5519400198339021,
|
| 998 |
+
"avg_soft_shaping_reward": 0.0,
|
| 999 |
+
"avg_spawn_count": 0.0,
|
| 1000 |
+
"avg_spawn_critical_steps": 0.0,
|
| 1001 |
+
"avg_steps_to_solution": 1.0,
|
| 1002 |
+
"deanonymization_accuracy": 0.0,
|
| 1003 |
+
"leaderboard_score": 0.2785970009916951,
|
| 1004 |
+
"retrieval_signal": 0.5,
|
| 1005 |
+
"spawn_completion_rate": 0.0,
|
| 1006 |
+
"spawn_signal": 0.4,
|
| 1007 |
+
"structural_signal": 0.5,
|
| 1008 |
+
"task_success_rate": 0.0,
|
| 1009 |
+
"tool_efficiency": 1.0
|
| 1010 |
+
},
|
| 1011 |
+
"run_id": "run_0024",
|
| 1012 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1013 |
+
},
|
| 1014 |
+
{
|
| 1015 |
+
"config": {
|
| 1016 |
+
"llm_model": "gpt-5.4-mini",
|
| 1017 |
+
"llm_provider": "openai",
|
| 1018 |
+
"max_agents": 3,
|
| 1019 |
+
"max_breadth": 2,
|
| 1020 |
+
"max_depth": 2,
|
| 1021 |
+
"max_steps": 24,
|
| 1022 |
+
"max_width": 2,
|
| 1023 |
+
"seed": 2026,
|
| 1024 |
+
"seeded_questions": 30,
|
| 1025 |
+
"swarm_enabled": true
|
| 1026 |
+
},
|
| 1027 |
+
"created_at": "2026-04-08T04:39:32+00:00",
|
| 1028 |
+
"episodes": 1,
|
| 1029 |
+
"metrics": {
|
| 1030 |
+
"avg_compactness_reward": 0.0,
|
| 1031 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 1032 |
+
"avg_connectivity_reward": 0.0,
|
| 1033 |
+
"avg_diversity_reward": 0.0,
|
| 1034 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1035 |
+
"avg_format_reward": 0.15,
|
| 1036 |
+
"avg_graph_f1": 0.0,
|
| 1037 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 1038 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 1039 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1040 |
+
"avg_reward": 0.5519400198339021,
|
| 1041 |
+
"avg_soft_shaping_reward": 0.0,
|
| 1042 |
+
"avg_spawn_count": 0.0,
|
| 1043 |
+
"avg_spawn_critical_steps": 0.0,
|
| 1044 |
+
"avg_steps_to_solution": 1.0,
|
| 1045 |
+
"deanonymization_accuracy": 0.0,
|
| 1046 |
+
"leaderboard_score": 0.2785970009916951,
|
| 1047 |
+
"retrieval_signal": 0.5,
|
| 1048 |
+
"spawn_completion_rate": 0.0,
|
| 1049 |
+
"spawn_signal": 0.4,
|
| 1050 |
+
"structural_signal": 0.5,
|
| 1051 |
+
"task_success_rate": 0.0,
|
| 1052 |
+
"tool_efficiency": 1.0
|
| 1053 |
+
},
|
| 1054 |
+
"run_id": "run_0025",
|
| 1055 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1056 |
+
},
|
| 1057 |
+
{
|
| 1058 |
+
"config": {
|
| 1059 |
+
"llm_model": "gpt-5.4-mini",
|
| 1060 |
+
"llm_provider": "openai",
|
| 1061 |
+
"max_agents": 3,
|
| 1062 |
+
"max_breadth": 2,
|
| 1063 |
+
"max_depth": 2,
|
| 1064 |
+
"max_steps": 24,
|
| 1065 |
+
"max_width": 2,
|
| 1066 |
+
"seed": 2026,
|
| 1067 |
+
"seeded_questions": 30,
|
| 1068 |
+
"swarm_enabled": true
|
| 1069 |
+
},
|
| 1070 |
+
"created_at": "2026-04-08T04:40:21+00:00",
|
| 1071 |
+
"episodes": 30,
|
| 1072 |
+
"metrics": {
|
| 1073 |
+
"avg_compactness_reward": 0.0,
|
| 1074 |
+
"avg_connectivity_gain_reward": 0.13333333333333336,
|
| 1075 |
+
"avg_connectivity_reward": 0.09999999999999999,
|
| 1076 |
+
"avg_diversity_reward": 0.03911111111111111,
|
| 1077 |
+
"avg_entity_informativeness_reward": -0.00951758755541623,
|
| 1078 |
+
"avg_format_reward": 0.15,
|
| 1079 |
+
"avg_graph_f1": 0.08482743691314255,
|
| 1080 |
+
"avg_knowledge_carrier_reward": 0.3333333333333333,
|
| 1081 |
+
"avg_knowledge_indexing_reward": 0.0832325289772058,
|
| 1082 |
+
"avg_relation_informativeness_reward": 0.024842289016879314,
|
| 1083 |
+
"avg_reward": 0.6636425017249088,
|
| 1084 |
+
"avg_soft_shaping_reward": 0.19999999999999993,
|
| 1085 |
+
"avg_spawn_count": 2.6666666666666665,
|
| 1086 |
+
"avg_spawn_critical_steps": 4.0,
|
| 1087 |
+
"avg_steps_to_solution": 6.333333333333333,
|
| 1088 |
+
"deanonymization_accuracy": 0.0,
|
| 1089 |
+
"leaderboard_score": 0.4644798510150634,
|
| 1090 |
+
"retrieval_signal": 0.6457980518086888,
|
| 1091 |
+
"spawn_completion_rate": 1.0,
|
| 1092 |
+
"spawn_signal": 0.7,
|
| 1093 |
+
"structural_signal": 0.5472649402922927,
|
| 1094 |
+
"task_success_rate": 0.6666666666666666,
|
| 1095 |
+
"tool_efficiency": 0.5
|
| 1096 |
+
},
|
| 1097 |
+
"run_id": "run_0026",
|
| 1098 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"config": {
|
| 1102 |
+
"llm_model": "gpt-5.4-mini",
|
| 1103 |
+
"llm_provider": "openai",
|
| 1104 |
+
"max_agents": 3,
|
| 1105 |
+
"max_breadth": 2,
|
| 1106 |
+
"max_depth": 2,
|
| 1107 |
+
"max_steps": 24,
|
| 1108 |
+
"max_width": 2,
|
| 1109 |
+
"seed": 2026,
|
| 1110 |
+
"seeded_questions": 30,
|
| 1111 |
+
"swarm_enabled": true
|
| 1112 |
+
},
|
| 1113 |
+
"created_at": "2026-04-08T05:01:16+00:00",
|
| 1114 |
+
"episodes": 10,
|
| 1115 |
+
"metrics": {
|
| 1116 |
+
"avg_compactness_reward": 0.0,
|
| 1117 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 1118 |
+
"avg_connectivity_reward": 0.0,
|
| 1119 |
+
"avg_diversity_reward": 0.0,
|
| 1120 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1121 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1122 |
+
"avg_graph_f1": 0.0,
|
| 1123 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 1124 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 1125 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1126 |
+
"avg_reward": 0.5519400198339021,
|
| 1127 |
+
"avg_soft_shaping_reward": 0.0,
|
| 1128 |
+
"avg_spawn_count": 0.0,
|
| 1129 |
+
"avg_spawn_critical_steps": 0.0,
|
| 1130 |
+
"avg_steps_to_solution": 1.0,
|
| 1131 |
+
"deanonymization_accuracy": 0.0,
|
| 1132 |
+
"leaderboard_score": 0.2785970009916951,
|
| 1133 |
+
"retrieval_signal": 0.5,
|
| 1134 |
+
"spawn_completion_rate": 0.0,
|
| 1135 |
+
"spawn_signal": 0.4,
|
| 1136 |
+
"structural_signal": 0.5,
|
| 1137 |
+
"task_success_rate": 0.0,
|
| 1138 |
+
"tool_efficiency": 1.0
|
| 1139 |
+
},
|
| 1140 |
+
"run_id": "run_0027",
|
| 1141 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1142 |
+
},
|
| 1143 |
+
{
|
| 1144 |
+
"config": {
|
| 1145 |
+
"llm_model": "gpt-5.4-mini",
|
| 1146 |
+
"llm_provider": "openai",
|
| 1147 |
+
"max_agents": 3,
|
| 1148 |
+
"max_breadth": 2,
|
| 1149 |
+
"max_depth": 2,
|
| 1150 |
+
"max_steps": 24,
|
| 1151 |
+
"max_width": 2,
|
| 1152 |
+
"seed": 2026,
|
| 1153 |
+
"seeded_questions": 30,
|
| 1154 |
+
"swarm_enabled": true
|
| 1155 |
+
},
|
| 1156 |
+
"created_at": "2026-04-08T05:01:29+00:00",
|
| 1157 |
+
"episodes": 10,
|
| 1158 |
+
"metrics": {
|
| 1159 |
+
"avg_compactness_reward": 0.0,
|
| 1160 |
+
"avg_connectivity_gain_reward": 0.19999999999999998,
|
| 1161 |
+
"avg_connectivity_reward": 0.06,
|
| 1162 |
+
"avg_diversity_reward": 0.0,
|
| 1163 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1164 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1165 |
+
"avg_graph_f1": 0.18535980927285275,
|
| 1166 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 1167 |
+
"avg_knowledge_indexing_reward": 0.09575879120879122,
|
| 1168 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1169 |
+
"avg_reward": 0.7109638031154166,
|
| 1170 |
+
"avg_soft_shaping_reward": 0.29999999999999993,
|
| 1171 |
+
"avg_spawn_count": 4.0,
|
| 1172 |
+
"avg_spawn_critical_steps": 6.0,
|
| 1173 |
+
"avg_steps_to_solution": 9.0,
|
| 1174 |
+
"deanonymization_accuracy": 0.0,
|
| 1175 |
+
"leaderboard_score": 0.5866289994462388,
|
| 1176 |
+
"retrieval_signal": 0.708515576923077,
|
| 1177 |
+
"spawn_completion_rate": 1.0,
|
| 1178 |
+
"spawn_signal": 0.6666666666666666,
|
| 1179 |
+
"structural_signal": 0.535,
|
| 1180 |
+
"task_success_rate": 1.0,
|
| 1181 |
+
"tool_efficiency": 0.5
|
| 1182 |
+
},
|
| 1183 |
+
"run_id": "run_0028",
|
| 1184 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1185 |
+
},
|
| 1186 |
+
{
|
| 1187 |
+
"config": {
|
| 1188 |
+
"llm_model": "gpt-5.4-mini",
|
| 1189 |
+
"llm_provider": "openai",
|
| 1190 |
+
"max_agents": 3,
|
| 1191 |
+
"max_breadth": 2,
|
| 1192 |
+
"max_depth": 2,
|
| 1193 |
+
"max_steps": 24,
|
| 1194 |
+
"max_width": 2,
|
| 1195 |
+
"seed": 2026,
|
| 1196 |
+
"seeded_questions": 30,
|
| 1197 |
+
"swarm_enabled": true
|
| 1198 |
+
},
|
| 1199 |
+
"created_at": "2026-04-08T05:01:43+00:00",
|
| 1200 |
+
"episodes": 10,
|
| 1201 |
+
"metrics": {
|
| 1202 |
+
"avg_compactness_reward": 0.0,
|
| 1203 |
+
"avg_connectivity_gain_reward": 0.19999999999999998,
|
| 1204 |
+
"avg_connectivity_reward": 0.24,
|
| 1205 |
+
"avg_diversity_reward": 0.11733333333333333,
|
| 1206 |
+
"avg_entity_informativeness_reward": -0.028552762666248687,
|
| 1207 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1208 |
+
"avg_graph_f1": 0.06912250146657492,
|
| 1209 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 1210 |
+
"avg_knowledge_indexing_reward": 0.15393879572282626,
|
| 1211 |
+
"avg_relation_informativeness_reward": 0.07452686705063795,
|
| 1212 |
+
"avg_reward": 0.7171006884027153,
|
| 1213 |
+
"avg_soft_shaping_reward": 0.29999999999999993,
|
| 1214 |
+
"avg_spawn_count": 4.0,
|
| 1215 |
+
"avg_spawn_critical_steps": 6.0,
|
| 1216 |
+
"avg_steps_to_solution": 9.0,
|
| 1217 |
+
"deanonymization_accuracy": 0.0,
|
| 1218 |
+
"leaderboard_score": 0.5730007362494549,
|
| 1219 |
+
"retrieval_signal": 0.7288785785029892,
|
| 1220 |
+
"spawn_completion_rate": 1.0,
|
| 1221 |
+
"spawn_signal": 0.6666666666666666,
|
| 1222 |
+
"structural_signal": 0.6067948208768779,
|
| 1223 |
+
"task_success_rate": 1.0,
|
| 1224 |
+
"tool_efficiency": 0.5
|
| 1225 |
+
},
|
| 1226 |
+
"run_id": "run_0029",
|
| 1227 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1228 |
+
},
|
| 1229 |
+
{
|
| 1230 |
+
"config": {
|
| 1231 |
+
"llm_model": "gpt-5.4-mini",
|
| 1232 |
+
"llm_provider": "openai",
|
| 1233 |
+
"max_agents": 3,
|
| 1234 |
+
"max_breadth": 2,
|
| 1235 |
+
"max_depth": 2,
|
| 1236 |
+
"max_steps": 24,
|
| 1237 |
+
"max_width": 2,
|
| 1238 |
+
"seed": 2026,
|
| 1239 |
+
"seeded_questions": 30,
|
| 1240 |
+
"swarm_enabled": true
|
| 1241 |
+
},
|
| 1242 |
+
"created_at": "2026-04-08T15:57:03+00:00",
|
| 1243 |
+
"episodes": 10,
|
| 1244 |
+
"metrics": {
|
| 1245 |
+
"avg_compactness_reward": 0.0,
|
| 1246 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 1247 |
+
"avg_connectivity_reward": 0.0,
|
| 1248 |
+
"avg_diversity_reward": 0.0,
|
| 1249 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1250 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1251 |
+
"avg_graph_f1": 0.0,
|
| 1252 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 1253 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 1254 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1255 |
+
"avg_reward": 0.5519400198339021,
|
| 1256 |
+
"avg_soft_shaping_reward": 0.0,
|
| 1257 |
+
"avg_spawn_count": 0.0,
|
| 1258 |
+
"avg_spawn_critical_steps": 0.0,
|
| 1259 |
+
"avg_steps_to_solution": 1.0,
|
| 1260 |
+
"deanonymization_accuracy": 0.0,
|
| 1261 |
+
"leaderboard_score": 0.2785970009916951,
|
| 1262 |
+
"retrieval_signal": 0.5,
|
| 1263 |
+
"spawn_completion_rate": 0.0,
|
| 1264 |
+
"spawn_signal": 0.4,
|
| 1265 |
+
"structural_signal": 0.5,
|
| 1266 |
+
"task_success_rate": 0.0,
|
| 1267 |
+
"tool_efficiency": 1.0
|
| 1268 |
+
},
|
| 1269 |
+
"run_id": "run_0030",
|
| 1270 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1271 |
+
},
|
| 1272 |
+
{
|
| 1273 |
+
"config": {
|
| 1274 |
+
"llm_model": "gpt-5.4-mini",
|
| 1275 |
+
"llm_provider": "openai",
|
| 1276 |
+
"max_agents": 3,
|
| 1277 |
+
"max_breadth": 2,
|
| 1278 |
+
"max_depth": 2,
|
| 1279 |
+
"max_steps": 24,
|
| 1280 |
+
"max_width": 2,
|
| 1281 |
+
"seed": 2026,
|
| 1282 |
+
"seeded_questions": 30,
|
| 1283 |
+
"swarm_enabled": true
|
| 1284 |
+
},
|
| 1285 |
+
"created_at": "2026-04-08T15:57:18+00:00",
|
| 1286 |
+
"episodes": 10,
|
| 1287 |
+
"metrics": {
|
| 1288 |
+
"avg_compactness_reward": 0.0,
|
| 1289 |
+
"avg_connectivity_gain_reward": 0.19999999999999998,
|
| 1290 |
+
"avg_connectivity_reward": 0.06,
|
| 1291 |
+
"avg_diversity_reward": 0.0,
|
| 1292 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1293 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1294 |
+
"avg_graph_f1": 0.18535980927285275,
|
| 1295 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 1296 |
+
"avg_knowledge_indexing_reward": 0.09575879120879122,
|
| 1297 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1298 |
+
"avg_reward": 0.7109638031154166,
|
| 1299 |
+
"avg_soft_shaping_reward": 0.29999999999999993,
|
| 1300 |
+
"avg_spawn_count": 4.0,
|
| 1301 |
+
"avg_spawn_critical_steps": 6.0,
|
| 1302 |
+
"avg_steps_to_solution": 9.0,
|
| 1303 |
+
"deanonymization_accuracy": 0.0,
|
| 1304 |
+
"leaderboard_score": 0.5866289994462388,
|
| 1305 |
+
"retrieval_signal": 0.708515576923077,
|
| 1306 |
+
"spawn_completion_rate": 1.0,
|
| 1307 |
+
"spawn_signal": 0.6666666666666666,
|
| 1308 |
+
"structural_signal": 0.535,
|
| 1309 |
+
"task_success_rate": 1.0,
|
| 1310 |
+
"tool_efficiency": 0.5
|
| 1311 |
+
},
|
| 1312 |
+
"run_id": "run_0031",
|
| 1313 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1314 |
+
},
|
| 1315 |
+
{
|
| 1316 |
+
"config": {
|
| 1317 |
+
"llm_model": "gpt-5.4-mini",
|
| 1318 |
+
"llm_provider": "openai",
|
| 1319 |
+
"max_agents": 3,
|
| 1320 |
+
"max_breadth": 2,
|
| 1321 |
+
"max_depth": 2,
|
| 1322 |
+
"max_steps": 24,
|
| 1323 |
+
"max_width": 2,
|
| 1324 |
+
"seed": 2026,
|
| 1325 |
+
"seeded_questions": 30,
|
| 1326 |
+
"swarm_enabled": true
|
| 1327 |
+
},
|
| 1328 |
+
"created_at": "2026-04-08T15:57:32+00:00",
|
| 1329 |
+
"episodes": 10,
|
| 1330 |
+
"metrics": {
|
| 1331 |
+
"avg_compactness_reward": 0.0,
|
| 1332 |
+
"avg_connectivity_gain_reward": 0.19999999999999998,
|
| 1333 |
+
"avg_connectivity_reward": 0.24,
|
| 1334 |
+
"avg_diversity_reward": 0.11733333333333333,
|
| 1335 |
+
"avg_entity_informativeness_reward": -0.028552762666248687,
|
| 1336 |
+
"avg_format_reward": 0.14999999999999997,
|
| 1337 |
+
"avg_graph_f1": 0.06912250146657492,
|
| 1338 |
+
"avg_knowledge_carrier_reward": 0.5,
|
| 1339 |
+
"avg_knowledge_indexing_reward": 0.15393879572282626,
|
| 1340 |
+
"avg_relation_informativeness_reward": 0.07452686705063795,
|
| 1341 |
+
"avg_reward": 0.7171006884027153,
|
| 1342 |
+
"avg_soft_shaping_reward": 0.29999999999999993,
|
| 1343 |
+
"avg_spawn_count": 4.0,
|
| 1344 |
+
"avg_spawn_critical_steps": 6.0,
|
| 1345 |
+
"avg_steps_to_solution": 9.0,
|
| 1346 |
+
"deanonymization_accuracy": 0.0,
|
| 1347 |
+
"leaderboard_score": 0.5730007362494549,
|
| 1348 |
+
"retrieval_signal": 0.7288785785029892,
|
| 1349 |
+
"spawn_completion_rate": 1.0,
|
| 1350 |
+
"spawn_signal": 0.6666666666666666,
|
| 1351 |
+
"structural_signal": 0.6067948208768779,
|
| 1352 |
+
"task_success_rate": 1.0,
|
| 1353 |
+
"tool_efficiency": 0.5
|
| 1354 |
+
},
|
| 1355 |
+
"run_id": "run_0032",
|
| 1356 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1357 |
+
},
|
| 1358 |
+
{
|
| 1359 |
+
"config": {
|
| 1360 |
+
"llm_model": "qwen3:8b",
|
| 1361 |
+
"llm_provider": "openai",
|
| 1362 |
+
"max_agents": 3,
|
| 1363 |
+
"max_breadth": 2,
|
| 1364 |
+
"max_depth": 2,
|
| 1365 |
+
"max_steps": 18,
|
| 1366 |
+
"max_width": 2,
|
| 1367 |
+
"seed": 7,
|
| 1368 |
+
"seeded_questions": 30,
|
| 1369 |
+
"swarm_enabled": true
|
| 1370 |
+
},
|
| 1371 |
+
"created_at": "2026-04-20T19:46:04+00:00",
|
| 1372 |
+
"episodes": 1,
|
| 1373 |
+
"metrics": {
|
| 1374 |
+
"avg_compactness_reward": 0.0,
|
| 1375 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 1376 |
+
"avg_connectivity_reward": 0.0,
|
| 1377 |
+
"avg_diversity_reward": 0.0,
|
| 1378 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 1379 |
+
"avg_format_reward": 0.15,
|
| 1380 |
+
"avg_graph_f1": 0.0,
|
| 1381 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 1382 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 1383 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 1384 |
+
"avg_reward": 0.5519400198339021,
|
| 1385 |
+
"avg_soft_shaping_reward": 0.0,
|
| 1386 |
+
"avg_spawn_count": 0.0,
|
| 1387 |
+
"avg_spawn_critical_steps": 0.0,
|
| 1388 |
+
"avg_steps_to_solution": 1.0,
|
| 1389 |
+
"deanonymization_accuracy": 0.0,
|
| 1390 |
+
"leaderboard_score": 0.2785970009916951,
|
| 1391 |
+
"retrieval_signal": 0.5,
|
| 1392 |
+
"spawn_completion_rate": 0.0,
|
| 1393 |
+
"spawn_signal": 0.4,
|
| 1394 |
+
"structural_signal": 0.5,
|
| 1395 |
+
"task_success_rate": 0.0,
|
| 1396 |
+
"tool_efficiency": 1.0
|
| 1397 |
+
},
|
| 1398 |
+
"run_id": "run_0033",
|
| 1399 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 1400 |
+
}
|
| 1401 |
+
]
|
datasets/fixed_levels/qwen_swarm_benchmark_fixed_levels.json
ADDED
|
@@ -0,0 +1,69 @@
{
  "dashboard": "datasets/fixed_levels/dashboard_fixed_levels.html",
  "record": {
    "config": {
      "max_agents": 3,
      "max_breadth": 2,
      "max_depth": 2,
      "max_steps": 20,
      "max_width": 2,
      "seed": 2026,
      "seeded_questions": 15,
      "swarm_enabled": true
    },
    "created_at": "2026-04-01T18:48:39+00:00",
    "episodes": 15,
    "metrics": {
      "avg_compactness_reward": 0.0,
      "avg_connectivity_gain_reward": 0.16666666666666666,
      "avg_connectivity_reward": 0.16999999999999998,
      "avg_diversity_reward": 0.1157777777777778,
      "avg_entity_informativeness_reward": -0.08858065677817137,
      "avg_format_reward": 0.14999999999999997,
      "avg_graph_f1": 0.8492063492063492,
      "avg_knowledge_carrier_reward": 0.5,
      "avg_knowledge_indexing_reward": 0.052000000000000005,
      "avg_relation_informativeness_reward": 0.07135858524047924,
      "avg_reward": 4.197526826881651,
      "avg_soft_shaping_reward": 0.24999999999999994,
      "avg_spawn_count": 4.0,
      "avg_spawn_critical_steps": 6.0,
      "avg_steps_to_solution": 9.0,
      "deanonymization_accuracy": 1.0,
      "leaderboard_score": 0.8543934355282199,
      "retrieval_signal": 0.6932,
      "spawn_completion_rate": 1.0,
      "spawn_signal": 0.6666666666666666,
      "structural_signal": 0.5730889190257948,
      "task_success_rate": 1.0,
      "tool_efficiency": 0.5
    },
    "run_id": "run_0001",
    "run_name": "fixed_levels_qwen_swarm"
  },
  "summary": {
    "avg_compactness_reward": 0.0,
    "avg_connectivity_gain_reward": 0.16666666666666666,
    "avg_connectivity_reward": 0.16999999999999998,
    "avg_diversity_reward": 0.1157777777777778,
    "avg_entity_informativeness_reward": -0.08858065677817137,
    "avg_format_reward": 0.14999999999999997,
    "avg_graph_f1": 0.8492063492063492,
    "avg_knowledge_carrier_reward": 0.5,
    "avg_knowledge_indexing_reward": 0.052000000000000005,
    "avg_relation_informativeness_reward": 0.07135858524047924,
    "avg_reward": 4.197526826881651,
    "avg_soft_shaping_reward": 0.24999999999999994,
    "avg_spawn_count": 4.0,
    "avg_spawn_critical_steps": 6.0,
    "avg_steps_to_solution": 9.0,
    "deanonymization_accuracy": 1.0,
    "leaderboard_score": 0.8543934355282199,
    "retrieval_signal": 0.6932,
    "spawn_completion_rate": 1.0,
    "spawn_signal": 0.6666666666666666,
    "structural_signal": 0.5730889190257948,
    "task_success_rate": 1.0,
    "tool_efficiency": 0.5
  }
}
|
datasets/fixed_levels/qwen_swarm_eval_by_difficulty.json
ADDED
|
@@ -0,0 +1,53 @@
{
  "by_difficulty": {
    "easy": {
      "avg_graph_f1": 1.0,
      "avg_reward": 3.610490808845623,
      "avg_steps": 9.0,
      "avg_tool_calls": 4.0,
      "episodes": 5,
      "task_success_rate": 1.0
    },
    "high": {
      "avg_graph_f1": 0.5476190476190477,
      "avg_reward": 4.207102815893519,
      "avg_steps": 9.0,
      "avg_tool_calls": 4.0,
      "episodes": 5,
      "task_success_rate": 1.0
    },
    "mid": {
      "avg_graph_f1": 1.0,
      "avg_reward": 4.822687547070801,
      "avg_steps": 9.0,
      "avg_tool_calls": 4.0,
      "episodes": 5,
      "task_success_rate": 1.0
    }
  },
  "overall": {
    "avg_compactness_reward": 0.0,
    "avg_connectivity_gain_reward": 0.16666666666666666,
    "avg_connectivity_reward": 0.16999999999999998,
    "avg_diversity_reward": 0.1157777777777778,
    "avg_entity_informativeness_reward": -0.07289878447762359,
    "avg_format_reward": 0.14999999999999997,
    "avg_graph_f1": 0.8492063492063492,
    "avg_knowledge_carrier_reward": 0.5,
    "avg_knowledge_indexing_reward": 0.052000000000000005,
    "avg_relation_informativeness_reward": 0.07157694332826091,
    "avg_reward": 4.213427057269981,
    "avg_soft_shaping_reward": 0.24999999999999994,
    "avg_spawn_count": 4.0,
    "avg_spawn_critical_steps": 6.0,
    "avg_steps_to_solution": 9.0,
    "deanonymization_accuracy": 1.0,
    "leaderboard_score": 0.8546911504342771,
    "retrieval_signal": 0.6932,
    "spawn_completion_rate": 1.0,
    "spawn_signal": 0.6666666666666666,
    "structural_signal": 0.5762689651034608,
    "task_success_rate": 1.0,
    "tool_efficiency": 0.5
  }
}
|
datasets/fixed_levels/qwen_swarm_eval_fixed_levels.json
ADDED
|
@@ -0,0 +1,25 @@
{
  "avg_compactness_reward": 0.0,
  "avg_connectivity_gain_reward": 0.16666666666666666,
  "avg_connectivity_reward": 0.16999999999999998,
  "avg_diversity_reward": 0.1157777777777778,
  "avg_entity_informativeness_reward": -0.02824631570420193,
  "avg_format_reward": 0.14999999999999997,
  "avg_graph_f1": 0.8492063492063492,
  "avg_knowledge_carrier_reward": 0.5,
  "avg_knowledge_indexing_reward": 0.07400000000000001,
  "avg_relation_informativeness_reward": 0.06905976285357758,
  "avg_reward": 4.285384567790942,
  "avg_soft_shaping_reward": 0.24999999999999994,
  "avg_spawn_count": 4.0,
  "avg_spawn_critical_steps": 6.0,
  "avg_steps_to_solution": 9.0,
  "deanonymization_accuracy": 1.0,
  "leaderboard_score": 0.8565775118852701,
  "retrieval_signal": 0.7009000000000001,
  "spawn_completion_rate": 1.0,
  "spawn_signal": 0.6666666666666666,
  "structural_signal": 0.5846960227632085,
  "task_success_rate": 1.0,
  "tool_efficiency": 0.5
}
|
datasets/fixed_levels/seed_fixed_levels.json
ADDED
|
The diff for this file is too large to render.
See raw diff
datasets/fixed_levels/shared_config_fixed_levels.json
ADDED
|
@@ -0,0 +1,63 @@
{
  "environment": {
    "n_users": 24,
    "alias_density": 0.2,
    "noise_level": 0.12,
    "red_herring_rate": 0.08,
    "max_steps": 24,
    "seed": 2026
  },
  "dataset": {
    "mode": "canonical",
    "metaqa_root": "metaQA",
    "metaqa_kb_path": "",
    "metaqa_variant": "vanilla",
    "metaqa_hops": ["1-hop", "2-hop", "3-hop"],
    "metaqa_splits": ["train", "dev", "test"]
  },
  "swarm": {
    "enabled": true,
    "max_agents": 3,
    "max_breadth": 2,
    "max_width": 2,
    "max_depth": 2,
    "planner_rounds": 2,
    "tools_per_agent": 1
  },
  "spawn_reward": {
    "lambda_parallel": 0.15,
    "lambda_finish": 0.2,
    "anneal": 1.0,
    "max_parallel_hint": 3
  },
  "seeding": {
    "seeded_nodes": [],
    "seeded_edges": [],
    "seeded_questions": [],
    "llm_generate_remaining_graph": true,
    "llm_generate_remaining_tasks": false,
    "llm_generated_edge_budget": 64,
    "llm_generated_task_budget": 0,
    "llm_generation_parallel": true,
    "llm_generation_workers": 4,
    "llm_generation_retries": 3,
    "allow_template_fallback_on_llm_failure": false
  },
  "llm": {
    "provider": "ollama",
    "model": "qwen3:2b",
    "temperature": 0.05,
    "max_tokens": 384,
    "timeout_seconds": 240,
    "ollama_base_url": "http://127.0.0.1:11434",
    "openai_base_url": "https://api.openai.com/v1",
    "openai_api_key_env": "OPENAI_API_KEY",
    "openai_api_key": ""
  },
  "runtime": {
    "default_episodes": 30,
    "leaderboard_path": "datasets/fixed_levels/leaderboard_fixed_levels.json",
    "dashboard_path": "datasets/fixed_levels/dashboard_fixed_levels.html",
    "sweep_dashboard_dir": "datasets/fixed_levels/sweep_dashboards"
  }
}
|
docs/adversarial_self_play.md
ADDED
|
@@ -0,0 +1,99 @@
# Adversarial Self-Play Training (Kimi-Style + TRL)

This repository now includes a code scaffold for alternating adversarial self-play with Hugging Face TRL.

## Goal

Train two policies in alternating rounds:

- Generator policy: proposes hard OSINT tasks (question + answer + supporting edges).
- Answerer policy: solves tasks proposed by the generator.

The loop is intended to move from static evaluation toward on-policy co-evolution.

## Kimi-style Objective Mapping

The implementation maps the requested Kimi-style ingredients onto TRL GRPO as follows (a minimal configuration sketch follows the list):

- Grouped rollouts: `num_generations` in each GRPO phase.
- Relative reward baseline: GRPO group-relative advantages.
- Clipped policy updates: `epsilon` clipping in the GRPO objective.
- KL/reference regularization: `beta` in `GRPOConfig`.
- Token-level online RL behavior: GRPO online generation with reward functions.
- Toggle schedule: explicit alternating generator and answerer rounds.
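As a rough illustration of where those knobs live in TRL, the sketch below builds a `GRPOConfig` for one phase. It is an assumption-laden example, not the repository's training code: the numeric values are placeholders, and exact field availability (notably `epsilon`) depends on the installed TRL release.

```python
# Hypothetical GRPO phase configuration; values are illustrative only.
from trl import GRPOConfig, GRPOTrainer  # requires the optional [train] dependencies

grpo_args = GRPOConfig(
    output_dir="artifacts/self_play/answerer_round_0",  # assumed output location
    num_generations=8,    # grouped rollouts per prompt; the group mean is the relative baseline
    epsilon=0.2,          # clipping range for the policy ratio
    beta=0.04,            # KL regularization toward the frozen reference policy
    learning_rate=1e-6,
    per_device_train_batch_size=8,
)

# One alternating phase would then look roughly like:
# trainer = GRPOTrainer(model=..., args=grpo_args, reward_funcs=[reward_fn], train_dataset=...)
# trainer.train()
```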
## Topology and Scheduling Options

- `model_topology: "dual"`: train separate generator and answerer models.
- `model_topology: "shared"`: train one shared model for both roles.
- Use `shared_model_name_or_path` to set the common base checkpoint.
- `phase_schedule: "generator_answerer"`: default two-phase loop per round.
- `phase_schedule: "answerer_generator_answerer"`: solver-first curriculum:
  1. Train the answerer on the current adversarial pool.
  2. Freeze that answerer snapshot while training the generator against it.
  3. Train the answerer again on the newly generated adversarial tasks.

This directly supports the "train solver, freeze, attack, retrain solver" sequence; a hypothetical config fragment is sketched below.
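The fragment below shows one plausible shape for these options. The key names come from this document; the values (including the base checkpoint) are placeholders, so treat `config/self_play_training_example.json` as the authoritative example.

```python
# Illustrative self-play options; the values are assumptions, not the shipped example config.
self_play_options = {
    "model_topology": "shared",                                  # or "dual"
    "shared_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",   # hypothetical base checkpoint
    "phase_schedule": "answerer_generator_answerer",             # solver-first curriculum
    "canonical_graph_mode": "fixed",                             # hold graph candidates fixed per prompt
    "tuning_mode": "lora",                                       # or "full"
    "dry_run": True,                                             # skip GRPO updates while smoke-testing
}
```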
## Canonical Graph Mode

- `canonical_graph_mode: "generate"` (default): the generator can propose canonical graph updates in `swarm_v2`.
- `canonical_graph_mode: "fixed"`: canonical graph candidates are held fixed per prompt, so training focuses on question/answer behavior over a stable graph structure.

## Tuning Modes

- `tuning_mode: "full"`: full-model GRPO fine-tuning.
- `tuning_mode: "lora"`: PEFT LoRA adapters for GRPO updates.
- Configure via the `lora` block: `r`, `alpha`, `dropout`, `target_modules`, `bias`, `task_type` (a PEFT sketch follows this list).
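A minimal sketch of how such a `lora` block plausibly maps onto PEFT; the ranks and target modules below are assumptions rather than the project's defaults.

```python
# Hypothetical mapping from the `lora` config block to a PEFT LoraConfig.
from peft import LoraConfig

lora_cfg = LoraConfig(
    r=16,                      # `r`
    lora_alpha=32,             # `alpha`
    lora_dropout=0.05,         # `dropout`
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # `target_modules` (model-dependent)
    bias="none",               # `bias`
    task_type="CAUSAL_LM",     # `task_type`
)
```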
## Reward Design

### Generator (adversarial swarm)

`GeneratorRewardFunction` combines weighted components:

- Validity: checks for parsable task fields and a bounded support-edge size.
- Hardness: rewards questions the frozen answerer currently gets wrong.
- Diversity: penalizes near-duplicate questions via token-overlap similarity.
- Consistency: rewards edge/answer/question grounding against the canonical graph context.

Weights are configurable in `generator_reward_weights`; a toy weighted-sum sketch follows.
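The sketch below only illustrates the weighted combination; the component names and default weights are placeholders, not the internals of `GeneratorRewardFunction`.

```python
# Toy weighted sum over precomputed component scores in [0, 1]; weights are illustrative.
def generator_reward(components: dict, weights: dict | None = None) -> float:
    w = weights or {"validity": 0.25, "hardness": 0.35, "diversity": 0.2, "consistency": 0.2}
    return sum(w[name] * components.get(name, 0.0) for name in w)

# A valid, hard task phrased too close to an earlier one scores well on hardness
# but is pulled down by the diversity term.
print(generator_reward({"validity": 1.0, "hardness": 1.0, "diversity": 0.3, "consistency": 0.8}))
```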
### Answerer (existing reward integration)

`AnswererRewardFunction` wraps existing environment reward logic:

- Reuses `compute_answer_reward` from `src/osint_env/env/reward.py`.
- Builds transient `TaskInstance` objects from training rows.
- Preserves difficulty-aware reward behavior (`easy` / `medium` / `hard`).

## Entry Points

- CLI command: `osint-env train-self-play`
- Main runner: `src/osint_env/training/self_play.py`
- Config loader: `src/osint_env/training/config.py`
- Reward functions: `src/osint_env/training/rewards.py`
- Example config: `config/self_play_training_example.json`

## Dry Run Mode

The example config sets `dry_run: true` by default.

In dry run mode, the pipeline still:

- Materializes generator/answerer datasets per round.
- Materializes the optional `answerer_pre_dataset` when using the solver-first schedule.
- Produces generated-task artifacts (fallback generator path).
- Writes a full run summary.

But it skips the expensive GRPO updates.

## Compute Mode

When compute is available:

1. Install train dependencies: `python -m pip install -e ".[train]"`
2. Disable dry run (`--dry-run` off and/or `"dry_run": false` in config).
3. Run `osint-env train-self-play`.

Outputs are written under `artifacts/self_play` unless overridden.
|
docs/reward_design_notes.md
ADDED
|
@@ -0,0 +1,94 @@
# Reward Design Notes

This environment uses a composite reward that adapts ideas from:

- AutoGraph-R1 (arXiv:2510.15339)
- UniRel (arXiv:2512.17043)
- DeepPath (EMNLP 2017, D17-1060)
- Multi-Hop KG Reasoning with Reward Shaping (EMNLP 2018, D18-1362)
- Kimi K2.5 (arXiv:2602.02276) for PARL-style swarm auxiliary shaping

Additional related context consulted:

- MINERVA (arXiv:1711.05851) for query-conditioned walk-style reasoning over KG paths.

## Components in this Branch

The implementation follows a staged reward design:

1. edge-level rewards during graph construction (`ADD_EDGE`)
2. answer-level rewards for retrieval usefulness and final task utility (`ANSWER`)
3. evaluation-level composite leaderboard score for benchmark ranking

### 1) Edge addition reward

For each `ADD_EDGE`, the reward combines the following terms (a minimal shaping sketch follows the list):

- Global accuracy term (DeepPath):
  - $r_{global} = +1$ if a candidate edge is correct, else $-1$ (scaled in code for stability).
- Soft shaping term (D18 reward shaping):
  - $R = R_b + (1 - R_b) f(s, r, o)$, where $f$ is a soft fact-plausibility score.
  - In code, $f$ is approximated by relation/type priors plus small domain priors.
- Efficiency term (DeepPath):
  - $r_{efficiency} \propto 1 / \text{step\_count}$.
- Diversity term (DeepPath):
  - novelty from cosine dissimilarity of edge signatures; repeated patterns are down-weighted.
- Relation/entity informativeness (UniRel):
  - relation rarity via normalized IDF of relation labels,
  - entity informativeness via an inverse hub penalty.
- Connectivity gain term:
  - rewards bridge edges that connect previously disconnected graph regions.
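A minimal sketch of the base-plus-shaping arithmetic, assuming a binary base reward and an external plausibility score; the mixing weights are placeholders, not the environment's actual scaling.

```python
# D18-style soft shaping on top of the DeepPath global term, plus an efficiency bonus.
def shaped_edge_reward(edge_is_correct: bool, plausibility: float, step_count: int) -> float:
    r_base = 1.0 if edge_is_correct else -1.0           # DeepPath global accuracy term (unscaled)
    r_soft = r_base + (1.0 - r_base) * plausibility     # R = R_b + (1 - R_b) * f(s, r, o)
    r_efficiency = 1.0 / max(1, step_count)             # r_efficiency ~ 1 / step_count
    return 0.6 * r_soft + 0.1 * r_efficiency            # illustrative mixing weights

# A wrong but plausible edge is softened from -1 toward 0 instead of being fully punished.
print(shaped_edge_reward(edge_is_correct=False, plausibility=0.7, step_count=4))
```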
### 2) Final answer reward

For `ANSWER`, the reward combines:

- format validity,
- answer correctness,
- knowledge-carrying utility (AutoGraph-R1 style):
  - $R_C(q, y, G) = \mathbb{I}[\text{deducible}(q, y \mid G)]$.
- knowledge-indexing utility (AutoGraph-R1 style):
  - $R_I(q, D_{gold}, G) = |\text{Top-}k(G, q) \cap D_{gold}| / |D_{gold}|$,
  - approximated in this environment with evidence recall over tool outputs.
- connectivity (UniRel style):
  - discrete connectivity reward over extracted seed entities, normalized for stable mixing.
- graph F1 against supporting edges,
- compactness penalty for unnecessary extra edges,
- efficiency bonus,
- relation/entity informativeness for the constructed subgraph,
- repetition penalty to discourage redundant relation-generation patterns.

UniRel-style aggregate view represented in this branch:

$$
R(a) \approx R_{fmt} + R_{con} + w_1 R_{ent} + w_2 R_{rel} + \text{task utility terms}
$$

with the task utility terms coming from the AutoGraph-inspired $R_C$ and $R_I$ components (the indexing approximation is sketched below).
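A sketch of the evidence-recall approximation of $R_I$, assuming the gold evidence and retrieved tool outputs are reduced to document identifiers; the identifiers below are made up for the example.

```python
# Evidence-recall stand-in for R_I = |Top-k(G, q) ∩ D_gold| / |D_gold|.
def knowledge_indexing_reward(retrieved_docs: set, gold_docs: set) -> float:
    if not gold_docs:
        return 0.0
    return len(retrieved_docs & gold_docs) / len(gold_docs)

print(knowledge_indexing_reward({"doc_1", "doc_3"}, {"doc_1", "doc_2", "doc_3"}))  # ~0.667
```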
## Telemetry

Per-step component rewards are aggregated into `info["reward_components"]`, enabling:

- richer benchmark summaries,
- leaderboard ranking by composite utility,
- visual diagnostics in dashboard exports.

Evaluation also computes derived retrieval and structural utility signals used in leaderboard ranking; a small consumer sketch follows.
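A tiny sketch of consuming that telemetry from a step's `info` dictionary; the component names in the example call are invented for illustration.

```python
# Flatten the per-step reward telemetry into a {component: value} view for logging.
def summarize_components(info: dict) -> dict:
    return {name: float(value) for name, value in info.get("reward_components", {}).items()}

print(summarize_components({"reward_components": {"format": 0.15, "graph_f1": 0.85}}))
```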
## Future Multi-Agent Notes

This branch now includes a low-width swarm baseline orchestrator that adds PARL-style auxiliary shaping on top of the core edge and answer rewards.

The helper implementation is in:

- `src/osint_env/env/spawn_reward_hooks.py`

It follows the Kimi K2.5 style decomposition (a minimal sketch follows the list):

- $r_{PARL}(x,y) = r_{perf}(x,y) + \lambda_1 r_{parallel} + \lambda_2 r_{finish}$,
- optional critical-steps shaping for latency-sensitive training,
- optional annealing of $\lambda_1, \lambda_2$ toward zero,
- optional breadth/depth shaping hooks for future branch integration.
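A minimal sketch of that decomposition with a single annealing factor. The default lambdas mirror the `spawn_reward` block in `shared_config_fixed_levels.json` (`lambda_parallel: 0.15`, `lambda_finish: 0.2`, `anneal: 1.0`); everything else is illustrative, not the code in `spawn_reward_hooks.py`.

```python
# r_PARL = r_perf + anneal * (lambda_1 * r_parallel + lambda_2 * r_finish)
def parl_reward(r_perf: float, r_parallel: float, r_finish: float,
                lambda_parallel: float = 0.15, lambda_finish: float = 0.2,
                anneal: float = 1.0) -> float:
    return r_perf + anneal * (lambda_parallel * r_parallel + lambda_finish * r_finish)

# With anneal -> 0 the auxiliary shaping vanishes and only task performance remains.
print(parl_reward(r_perf=0.7, r_parallel=0.5, r_finish=1.0))
```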
The expanded project-level walkthrough is in `README.md` under "Reward Design (Integrated Notes)".
|
inference.py
ADDED
|
@@ -0,0 +1,540 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from osint_env.agents.single_agent import SingleAgentRunner
|
| 9 |
+
from osint_env.agents.swarm_agent import SwarmAgentRunner
|
| 10 |
+
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
|
| 11 |
+
from osint_env.domain.models import EnvironmentConfig
|
| 12 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 13 |
+
from osint_env.env.reward import compute_graph_f1
|
| 14 |
+
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard
|
| 15 |
+
from osint_env.eval.metrics import EvalMetrics
|
| 16 |
+
from osint_env.llm import build_llm_client
|
| 17 |
+
from osint_env.viz import export_dashboard
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
CONFIG_PATH = os.getenv("CONFIG_PATH", "datasets/fixed_levels/shared_config_fixed_levels.json")
|
| 21 |
+
SEED_FILE = os.getenv("SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json")
|
| 22 |
+
AGENT_MODE = os.getenv("AGENT_MODE", "swarm")
|
| 23 |
+
LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai")
|
| 24 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-5.4")
|
| 25 |
+
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "")
|
| 26 |
+
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "")
|
| 27 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 28 |
+
OPENAI_API_KEY_ENV = os.getenv("OPENAI_API_KEY_ENV", "OPENAI_API_KEY")
|
| 29 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 30 |
+
API_KEY = os.getenv("API_KEY", "")
|
| 31 |
+
HF_SPACE_URL = os.getenv("HF_SPACE_URL", "")
|
| 32 |
+
HF_TOKEN = os.getenv("HF_TOKEN","")
|
| 33 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME", "")
|
| 34 |
+
LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "0"))
|
| 35 |
+
EPISODES = int(os.getenv("EPISODES", "1"))
|
| 36 |
+
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.67"))
|
| 37 |
+
TASK_INDICES_RAW = os.getenv("TASK_INDICES", "")
|
| 38 |
+
DATASET_MODE = os.getenv("DATASET_MODE", "")
|
| 39 |
+
METAQA_ROOT = os.getenv("METAQA_ROOT", "")
|
| 40 |
+
METAQA_KB_PATH = os.getenv("METAQA_KB_PATH", "")
|
| 41 |
+
METAQA_VARIANT = os.getenv("METAQA_VARIANT", "")
|
| 42 |
+
METAQA_HOPS_RAW = os.getenv("METAQA_HOPS", "")
|
| 43 |
+
METAQA_SPLITS_RAW = os.getenv("METAQA_SPLITS", "")
|
| 44 |
+
|
| 45 |
+
WRITE_BENCHMARK_ARTIFACTS = os.getenv("WRITE_BENCHMARK_ARTIFACTS", "1").strip().lower() in {
|
| 46 |
+
"1",
|
| 47 |
+
"true",
|
| 48 |
+
"yes",
|
| 49 |
+
"y",
|
| 50 |
+
"on",
|
| 51 |
+
}
|
| 52 |
+
LEADERBOARD_PATH = os.getenv("LEADERBOARD_PATH", "datasets/fixed_levels/leaderboard_fixed_levels.json")
|
| 53 |
+
DASHBOARD_PATH = os.getenv("DASHBOARD_PATH", "datasets/fixed_levels/dashboard_fixed_levels.html")
|
| 54 |
+
RUN_NAME = os.getenv("RUN_NAME", "fixed_levels_qwen_swarm")
|
| 55 |
+
|
| 56 |
+
BENCHMARK = "osint-openenv"
|
| 57 |
+
TASK_NAME = "fixed_levels_easy_mid_hard"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _parse_task_indices(raw: str) -> list[int]:
|
| 61 |
+
out: list[int] = []
|
| 62 |
+
for token in str(raw or "").split(","):
|
| 63 |
+
stripped = token.strip()
|
| 64 |
+
if not stripped:
|
| 65 |
+
continue
|
| 66 |
+
try:
|
| 67 |
+
out.append(int(stripped))
|
| 68 |
+
except ValueError:
|
| 69 |
+
continue
|
| 70 |
+
return out
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _parse_csv_tokens(raw: str) -> list[str]:
|
| 74 |
+
return [token.strip() for token in str(raw or "").split(",") if token.strip()]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _normalize_ollama_base_url(url: str) -> str:
|
| 78 |
+
normalized = str(url or "").strip().rstrip("/")
|
| 79 |
+
if normalized.endswith("/v1"):
|
| 80 |
+
normalized = normalized[:-3].rstrip("/")
|
| 81 |
+
return normalized or "http://127.0.0.1:11434"
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _normalize_openai_base_url(url: str) -> str:
|
| 85 |
+
normalized = str(url or "").strip().rstrip("/")
|
| 86 |
+
if not normalized:
|
| 87 |
+
return ""
|
| 88 |
+
if normalized.endswith("/v1"):
|
| 89 |
+
return normalized
|
| 90 |
+
return f"{normalized}/v1"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
TASK_INDICES = _parse_task_indices(TASK_INDICES_RAW)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 97 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
|
| 101 |
+
error_text = "null" if error is None else str(error)
|
| 102 |
+
print(
|
| 103 |
+
f"[STEP] step={step} action={action} reward={reward:.2f} done={str(bool(done)).lower()} error={error_text}",
|
| 104 |
+
flush=True,
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def log_end(task: str, success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 109 |
+
rewards_text = ",".join(f"{value:.2f}" for value in rewards)
|
| 110 |
+
print(
|
| 111 |
+
f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.2f} rewards={rewards_text}",
|
| 112 |
+
flush=True,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _looks_like_placeholder_api_key(value: str) -> bool:
|
| 117 |
+
token = str(value or "").strip().lower()
|
| 118 |
+
if not token:
|
| 119 |
+
return True
|
| 120 |
+
placeholder_markers = [
|
| 121 |
+
"your_openai_api_key",
|
| 122 |
+
"your-key",
|
| 123 |
+
"your_key",
|
| 124 |
+
"your real",
|
| 125 |
+
"real-openai-key",
|
| 126 |
+
"replace-me",
|
| 127 |
+
"changeme",
|
| 128 |
+
"example",
|
| 129 |
+
"<api-key>",
|
| 130 |
+
]
|
| 131 |
+
if token.startswith("your_") or token.startswith("sk-your-"):
|
| 132 |
+
return True
|
| 133 |
+
return any(marker in token for marker in placeholder_markers)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _format_action(action: dict[str, Any]) -> str:
|
| 137 |
+
action_type = str(action.get("action_type", "")).upper()
|
| 138 |
+
payload = dict(action.get("payload", {}))
|
| 139 |
+
|
| 140 |
+
if action_type == "ANSWER":
|
| 141 |
+
return f"answer({str(payload.get('answer', 'unknown')).strip()})"
|
| 142 |
+
|
| 143 |
+
if action_type == "ADD_EDGE":
|
| 144 |
+
try:
|
| 145 |
+
conf = float(payload.get("confidence", 1.0))
|
| 146 |
+
except (TypeError, ValueError):
|
| 147 |
+
conf = 1.0
|
| 148 |
+
return (
|
| 149 |
+
"add_edge("
|
| 150 |
+
f"{payload.get('src', '')},"
|
| 151 |
+
f"{payload.get('rel', '')},"
|
| 152 |
+
f"{payload.get('dst', '')},"
|
| 153 |
+
f"{conf:.2f}"
|
| 154 |
+
")"
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
tool_name = str(payload.get("tool_name", "tool")).strip() or "tool"
|
| 158 |
+
args = payload.get("args", {})
|
| 159 |
+
if not isinstance(args, dict) or not args:
|
| 160 |
+
return f"{tool_name}()"
|
| 161 |
+
args_text = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
|
| 162 |
+
return f"{tool_name}({args_text})"
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _assistant_tool_call_id(message: dict[str, Any]) -> str | None:
|
| 166 |
+
tool_calls = list(message.get("tool_calls", []))
|
| 167 |
+
if not tool_calls:
|
| 168 |
+
return None
|
| 169 |
+
tool_call_id = tool_calls[0].get("id")
|
| 170 |
+
return str(tool_call_id) if tool_call_id else None
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _tool_result_message(assistant_message: dict[str, Any], result: dict[str, Any]) -> dict[str, Any] | None:
|
| 174 |
+
tool_call_id = _assistant_tool_call_id(assistant_message)
|
| 175 |
+
if not tool_call_id:
|
| 176 |
+
return None
|
| 177 |
+
return {
|
| 178 |
+
"role": "tool",
|
| 179 |
+
"tool_call_id": tool_call_id,
|
| 180 |
+
"content": json.dumps(result, sort_keys=True),
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _resolve_environment_config() -> EnvironmentConfig:
|
| 185 |
+
shared = load_shared_config(CONFIG_PATH)
|
| 186 |
+
env_cfg = clone_environment_config(shared.environment)
|
| 187 |
+
|
| 188 |
+
if SEED_FILE and Path(SEED_FILE).exists():
|
| 189 |
+
env_cfg.seeding = load_seeding_config(SEED_FILE)
|
| 190 |
+
|
| 191 |
+
mode = AGENT_MODE.strip().lower()
|
| 192 |
+
if mode == "single":
|
| 193 |
+
env_cfg.swarm.enabled = False
|
| 194 |
+
elif mode == "swarm":
|
| 195 |
+
env_cfg.swarm.enabled = True
|
| 196 |
+
|
| 197 |
+
# Inference submissions must route all calls through OpenAI-compatible client config.
|
| 198 |
+
env_cfg.llm.provider = "openai"
|
| 199 |
+
env_cfg.llm.model = MODEL_NAME.strip()
|
| 200 |
+
|
| 201 |
+
if LLM_TIMEOUT_SECONDS > 0:
|
| 202 |
+
env_cfg.llm.timeout_seconds = int(LLM_TIMEOUT_SECONDS)
|
| 203 |
+
|
| 204 |
+
# Evaluation harnesses inject API_BASE_URL/HF_TOKEN for proxy-enforced requests.
|
| 205 |
+
resolved_openai_base = API_BASE_URL.strip() or OPENAI_BASE_URL.strip() or HF_SPACE_URL.strip()
|
| 206 |
+
if resolved_openai_base:
|
| 207 |
+
env_cfg.llm.openai_base_url = _normalize_openai_base_url(resolved_openai_base)
|
| 208 |
+
|
| 209 |
+
if HF_TOKEN.strip():
|
| 210 |
+
env_cfg.llm.openai_api_key = HF_TOKEN.strip()
|
| 211 |
+
elif API_KEY.strip():
|
| 212 |
+
env_cfg.llm.openai_api_key = API_KEY.strip()
|
| 213 |
+
elif OPENAI_API_KEY.strip():
|
| 214 |
+
env_cfg.llm.openai_api_key = OPENAI_API_KEY.strip()
|
| 215 |
+
|
| 216 |
+
if OPENAI_API_KEY_ENV.strip():
|
| 217 |
+
env_cfg.llm.openai_api_key_env = OPENAI_API_KEY_ENV.strip()
|
| 218 |
+
|
| 219 |
+
dataset_mode = DATASET_MODE.strip().lower()
|
| 220 |
+
if dataset_mode in {"canonical", "metaqa"}:
|
| 221 |
+
env_cfg.dataset_mode = dataset_mode
|
| 222 |
+
|
| 223 |
+
if METAQA_ROOT.strip():
|
| 224 |
+
env_cfg.metaqa_root = METAQA_ROOT.strip()
|
| 225 |
+
if METAQA_KB_PATH.strip():
|
| 226 |
+
env_cfg.metaqa_kb_path = METAQA_KB_PATH.strip()
|
| 227 |
+
|
| 228 |
+
metaqa_variant = METAQA_VARIANT.strip().lower()
|
| 229 |
+
if metaqa_variant in {"vanilla", "ntm"}:
|
| 230 |
+
env_cfg.metaqa_variant = metaqa_variant
|
| 231 |
+
|
| 232 |
+
metaqa_hops = _parse_csv_tokens(METAQA_HOPS_RAW)
|
| 233 |
+
if metaqa_hops:
|
| 234 |
+
env_cfg.metaqa_hops = metaqa_hops
|
| 235 |
+
|
| 236 |
+
metaqa_splits = _parse_csv_tokens(METAQA_SPLITS_RAW)
|
| 237 |
+
if metaqa_splits:
|
| 238 |
+
env_cfg.metaqa_splits = metaqa_splits
|
| 239 |
+
|
| 240 |
+
return env_cfg
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _runner_for(env: OSINTEnvironment, llm: Any) -> SingleAgentRunner | SwarmAgentRunner:
|
| 244 |
+
if env.config.swarm.enabled:
|
| 245 |
+
return SwarmAgentRunner(env=env, llm=llm)
|
| 246 |
+
return SingleAgentRunner(env=env, llm=llm)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def _normalize_difficulty(value: str) -> str:
|
| 250 |
+
token = str(value or "").strip().lower()
|
| 251 |
+
if token in {"easy", "e"}:
|
| 252 |
+
return "easy"
|
| 253 |
+
if token in {"mid", "medium", "m"}:
|
| 254 |
+
return "medium"
|
| 255 |
+
if token in {"high", "hard", "h"}:
|
| 256 |
+
return "hard"
|
| 257 |
+
return "hard"
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _task_difficulty(env: OSINTEnvironment, task_index: int) -> str:
|
| 261 |
+
idx = int(task_index) % max(1, len(env.tasks))
|
| 262 |
+
task = env.tasks[idx]
|
| 263 |
+
if isinstance(task.metadata, dict) and "difficulty" in task.metadata:
|
| 264 |
+
return _normalize_difficulty(str(task.metadata.get("difficulty", "")))
|
| 265 |
+
if idx < 10:
|
| 266 |
+
return "easy"
|
| 267 |
+
if idx < 20:
|
| 268 |
+
return "medium"
|
| 269 |
+
return "hard"
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _episode_row(env: OSINTEnvironment, info: dict[str, Any]) -> dict[str, Any]:
|
| 273 |
+
if env.state is None:
|
| 274 |
+
return {
|
| 275 |
+
"task_id": "unknown",
|
| 276 |
+
"task_type": "unknown",
|
| 277 |
+
"question": "",
|
| 278 |
+
"task_answer": str(info.get("task_answer", "")),
|
| 279 |
+
"agent_answer": str(info.get("agent_answer", "")),
|
| 280 |
+
"graph_f1": 0.0,
|
| 281 |
+
"reward": float(info.get("total_reward", 0.0) or 0.0),
|
| 282 |
+
"steps": int(info.get("step_count", 0) or 0),
|
| 283 |
+
"tool_calls": int(info.get("tool_calls", 0) or 0),
|
| 284 |
+
"success": int(info.get("agent_answer") == info.get("task_answer")),
|
| 285 |
+
"reward_components": dict(info.get("reward_components", {})),
|
| 286 |
+
"pred_edges": [],
|
| 287 |
+
"truth_edges": [],
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)
|
| 291 |
+
return {
|
| 292 |
+
"task_id": env.state.task.task_id,
|
| 293 |
+
"task_type": env.state.task.task_type,
|
| 294 |
+
"question": env.state.task.question,
|
| 295 |
+
"task_answer": str(info.get("task_answer", "")),
|
| 296 |
+
"agent_answer": str(info.get("agent_answer", "")) if info.get("agent_answer") is not None else "",
|
| 297 |
+
"graph_f1": graph_f1,
|
| 298 |
+
"reward": float(info.get("total_reward", 0.0) or 0.0),
|
| 299 |
+
"steps": int(info.get("step_count", 0) or 0),
|
| 300 |
+
"tool_calls": int(info.get("tool_calls", 0) or 0),
|
| 301 |
+
"success": int(info.get("agent_answer") == info.get("task_answer")),
|
| 302 |
+
"reward_components": dict(info.get("reward_components", {})),
|
| 303 |
+
"spawn_count": int(info.get("spawn_count", 0) or 0),
|
| 304 |
+
"spawn_critical_steps": int(info.get("spawn_critical_steps", 0) or 0),
|
| 305 |
+
"pred_edges": [
|
| 306 |
+
{
|
| 307 |
+
"src": edge.src,
|
| 308 |
+
"rel": edge.rel,
|
| 309 |
+
"dst": edge.dst,
|
| 310 |
+
"confidence": float(edge.confidence),
|
| 311 |
+
}
|
| 312 |
+
for edge in env.memory_graph.edges
|
| 313 |
+
],
|
| 314 |
+
"truth_edges": [
|
| 315 |
+
{
|
| 316 |
+
"src": edge.src,
|
| 317 |
+
"rel": edge.rel,
|
| 318 |
+
"dst": edge.dst,
|
| 319 |
+
"confidence": float(edge.confidence),
|
| 320 |
+
}
|
| 321 |
+
for edge in env.state.task.supporting_edges
|
| 322 |
+
],
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def _last_action_error(observation: Any, info: dict[str, Any]) -> str | None:
|
| 327 |
+
raw = info.get("last_action_error") if isinstance(info, dict) else None
|
| 328 |
+
if raw is not None:
|
| 329 |
+
return str(raw)
|
| 330 |
+
|
| 331 |
+
tool_outputs = getattr(observation, "tool_outputs", None)
|
| 332 |
+
if isinstance(tool_outputs, list) and tool_outputs:
|
| 333 |
+
last = tool_outputs[-1]
|
| 334 |
+
if isinstance(last, dict):
|
| 335 |
+
output = last.get("output")
|
| 336 |
+
if isinstance(output, dict) and output.get("error") is not None:
|
| 337 |
+
return str(output.get("error"))
|
| 338 |
+
return None
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def _install_step_logger(env: OSINTEnvironment) -> tuple[list[float], dict[str, int], Any]:
|
| 342 |
+
rewards: list[float] = []
|
| 343 |
+
counters = {"steps": 0}
|
| 344 |
+
original_step = env.step
|
| 345 |
+
|
| 346 |
+
def _logged_step(action: Any):
|
| 347 |
+
observation, reward, done, info = original_step(action)
|
| 348 |
+
counters["steps"] += 1
|
| 349 |
+
reward_value = float(reward or 0.0)
|
| 350 |
+
rewards.append(reward_value)
|
| 351 |
+
action_type = getattr(action, "action_type", "")
|
| 352 |
+
action_type_value = str(getattr(action_type, "value", action_type))
|
| 353 |
+
action_text = _format_action(
|
| 354 |
+
{
|
| 355 |
+
"action_type": action_type_value,
|
| 356 |
+
"payload": dict(getattr(action, "payload", {}) or {}),
|
| 357 |
+
}
|
| 358 |
+
)
|
| 359 |
+
log_step(
|
| 360 |
+
step=counters["steps"],
|
| 361 |
+
action=action_text,
|
| 362 |
+
reward=reward_value,
|
| 363 |
+
done=bool(done),
|
| 364 |
+
error=_last_action_error(observation, info if isinstance(info, dict) else {}),
|
| 365 |
+
)
|
| 366 |
+
return observation, reward, done, info
|
| 367 |
+
|
| 368 |
+
env.step = _logged_step
|
| 369 |
+
return rewards, counters, original_step
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def _validate_required_configuration() -> None:
|
| 373 |
+
missing: list[str] = []
|
| 374 |
+
|
| 375 |
+
api_base = API_BASE_URL.strip()
|
| 376 |
+
model_name = MODEL_NAME.strip()
|
| 377 |
+
hf_token = HF_TOKEN.strip()
|
| 378 |
+
api_key = API_KEY.strip()
|
| 379 |
+
openai_key = OPENAI_API_KEY.strip()
|
| 380 |
+
|
| 381 |
+
if not api_base or api_base == "<your-active-endpoint>":
|
| 382 |
+
missing.append("API_BASE_URL")
|
| 383 |
+
if not model_name or model_name == "<your-active-model>":
|
| 384 |
+
missing.append("MODEL_NAME")
|
| 385 |
+
if not (hf_token or api_key or openai_key):
|
| 386 |
+
missing.append("HF_TOKEN|API_KEY|OPENAI_API_KEY")
|
| 387 |
+
|
| 388 |
+
# Required when using docker-image based env construction.
|
| 389 |
+
if os.getenv("REQUIRE_LOCAL_IMAGE_NAME", "0").strip().lower() in {"1", "true", "yes", "on"}:
|
| 390 |
+
if not LOCAL_IMAGE_NAME.strip():
|
| 391 |
+
missing.append("LOCAL_IMAGE_NAME")
|
| 392 |
+
|
| 393 |
+
if missing:
|
| 394 |
+
raise RuntimeError(f"Missing required environment variables: {', '.join(sorted(set(missing)))}")
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def _task_targets(env: OSINTEnvironment, episodes: int, task_indices: list[int]) -> list[int | None]:
|
| 398 |
+
if task_indices:
|
| 399 |
+
task_count = max(1, len(env.tasks))
|
| 400 |
+
return [index % task_count for index in task_indices]
|
| 401 |
+
return [None] * max(1, episodes)
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def _run_with_runner(
|
| 405 |
+
env: OSINTEnvironment,
|
| 406 |
+
llm: Any,
|
| 407 |
+
episodes: int,
|
| 408 |
+
task_indices: list[int],
|
| 409 |
+
) -> tuple[dict[str, Any], list[dict[str, Any]], list[float], int]:
|
| 410 |
+
metrics = EvalMetrics()
|
| 411 |
+
episode_rows: list[dict[str, Any]] = []
|
| 412 |
+
rewards, counters, original_step = _install_step_logger(env)
|
| 413 |
+
|
| 414 |
+
single_runner = SingleAgentRunner(env=env, llm=llm)
|
| 415 |
+
swarm_runner = SwarmAgentRunner(env=env, llm=llm) if env.config.swarm.enabled else None
|
| 416 |
+
|
| 417 |
+
try:
|
| 418 |
+
for task_index in _task_targets(env, episodes, task_indices):
|
| 419 |
+
task_count = max(1, len(env.tasks))
|
| 420 |
+
selected_index = env._task_idx % task_count if task_index is None else int(task_index) % task_count
|
| 421 |
+
if task_index is not None:
|
| 422 |
+
# Keep compatibility with explicit task selection from the previous inference script.
|
| 423 |
+
env._task_idx = selected_index
|
| 424 |
+
|
| 425 |
+
difficulty = _task_difficulty(env, selected_index)
|
| 426 |
+
if difficulty == "easy":
|
| 427 |
+
runner: SingleAgentRunner | SwarmAgentRunner = single_runner
|
| 428 |
+
elif swarm_runner is not None:
|
| 429 |
+
runner = swarm_runner
|
| 430 |
+
else:
|
| 431 |
+
runner = single_runner
|
| 432 |
+
|
| 433 |
+
info = runner.run_episode()
|
| 434 |
+
if env.state is None:
|
| 435 |
+
continue
|
| 436 |
+
|
| 437 |
+
graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)
|
| 438 |
+
metrics.add(info, task_type=env.state.task.task_type, graph_f1=graph_f1)
|
| 439 |
+
episode_rows.append(_episode_row(env, info))
|
| 440 |
+
finally:
|
| 441 |
+
env.step = original_step
|
| 442 |
+
|
| 443 |
+
return metrics.summary(), episode_rows, rewards, int(counters["steps"])
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def _maybe_write_artifacts(
|
| 447 |
+
env: OSINTEnvironment,
|
| 448 |
+
summary: dict[str, Any],
|
| 449 |
+
episodes: int,
|
| 450 |
+
episode_rows: list[dict[str, Any]],
|
| 451 |
+
) -> tuple[dict[str, Any] | None, str | None]:
|
| 452 |
+
if not WRITE_BENCHMARK_ARTIFACTS:
|
| 453 |
+
return None, None
|
| 454 |
+
|
| 455 |
+
record = append_leaderboard_record(
|
| 456 |
+
path=LEADERBOARD_PATH,
|
| 457 |
+
summary=summary,
|
| 458 |
+
episodes=episodes,
|
| 459 |
+
run_name=RUN_NAME or None,
|
| 460 |
+
config={
|
| 461 |
+
"seed": env.config.seed,
|
| 462 |
+
"max_steps": env.config.max_steps,
|
| 463 |
+
"swarm_enabled": env.config.swarm.enabled,
|
| 464 |
+
"max_agents": env.config.swarm.max_agents,
|
| 465 |
+
"max_breadth": env.config.swarm.max_breadth,
|
| 466 |
+
"max_width": env.config.swarm.max_width,
|
| 467 |
+
"max_depth": env.config.swarm.max_depth,
|
| 468 |
+
"seeded_questions": len(env.config.seeding.seeded_questions),
|
| 469 |
+
"llm_provider": env.config.llm.provider,
|
| 470 |
+
"llm_model": env.config.llm.model,
|
| 471 |
+
},
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
leaderboard = load_leaderboard(LEADERBOARD_PATH)
|
| 475 |
+
dashboard = export_dashboard(
|
| 476 |
+
env=env,
|
| 477 |
+
evaluation={"summary": summary, "episodes": episode_rows},
|
| 478 |
+
leaderboard_records=leaderboard,
|
| 479 |
+
output_path=DASHBOARD_PATH,
|
| 480 |
+
)
|
| 481 |
+
return record, dashboard
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
def main() -> None:
|
| 485 |
+
_validate_required_configuration()
|
| 486 |
+
env_cfg = _resolve_environment_config()
|
| 487 |
+
llm_client = build_llm_client(env_cfg.llm)
|
| 488 |
+
|
| 489 |
+
episodes_given = "EPISODES" in os.environ and str(os.getenv("EPISODES", "")).strip() != ""
|
| 490 |
+
task_indices_given = bool(TASK_INDICES)
|
| 491 |
+
|
| 492 |
+
if not episodes_given and not task_indices_given:
|
| 493 |
+
runs: list[tuple[str, list[int], int]] = [
|
| 494 |
+
("easy", list(range(0, 10)), 10),
|
| 495 |
+
("mid", list(range(10, 20)), 10),
|
| 496 |
+
("hard", list(range(20, 30)), 10),
|
| 497 |
+
]
|
| 498 |
+
else:
|
| 499 |
+
selected_indices = TASK_INDICES if task_indices_given else []
|
| 500 |
+
episodes = len(selected_indices) if selected_indices else max(1, EPISODES)
|
| 501 |
+
runs = [(TASK_NAME, selected_indices, episodes)]
|
| 502 |
+
|
| 503 |
+
for task_name, run_indices, run_episodes in runs:
|
| 504 |
+
env: OSINTEnvironment | None = None
|
| 505 |
+
rewards: list[float] = []
|
| 506 |
+
steps_taken = 0
|
| 507 |
+
score = 0.0
|
| 508 |
+
success = False
|
| 509 |
+
|
| 510 |
+
env = OSINTEnvironment(env_cfg, llm=llm_client)
|
| 511 |
+
log_start(task=task_name, env=BENCHMARK, model=env_cfg.llm.model)
|
| 512 |
+
|
| 513 |
+
try:
|
| 514 |
+
summary, episode_rows, rewards, steps_taken = _run_with_runner(
|
| 515 |
+
env=env,
|
| 516 |
+
llm=llm_client,
|
| 517 |
+
episodes=run_episodes,
|
| 518 |
+
task_indices=run_indices,
|
| 519 |
+
)
|
| 520 |
+
|
| 521 |
+
score = float(summary.get("avg_reward", 0.0) or 0.0)
|
| 522 |
+
score = max(0.0, min(1.0, score))
|
| 523 |
+
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 524 |
+
|
| 525 |
+
_maybe_write_artifacts(
|
| 526 |
+
env=env,
|
| 527 |
+
summary=summary,
|
| 528 |
+
episodes=run_episodes,
|
| 529 |
+
episode_rows=episode_rows,
|
| 530 |
+
)
|
| 531 |
+
finally:
|
| 532 |
+
if env is not None:
|
| 533 |
+
close_fn = getattr(env, "close", None)
|
| 534 |
+
if callable(close_fn):
|
| 535 |
+
close_fn()
|
| 536 |
+
log_end(task=task_name, success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
if __name__ == "__main__":
|
| 540 |
+
main()
|
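A minimal sketch of driving the entry point above through environment variables. The endpoint, model name, and key below are placeholders; _validate_required_configuration() only requires API_BASE_URL, MODEL_NAME, and one of HF_TOKEN / API_KEY / OPENAI_API_KEY, and when neither EPISODES nor TASK_INDICES is set, main() falls back to the easy/mid/hard sweep over task indices 0-29.

# Hypothetical values; replace with a real endpoint and key before running.
import os
import subprocess

os.environ.update({
    "API_BASE_URL": "https://example-endpoint.invalid/v1",  # placeholder endpoint
    "MODEL_NAME": "my-chat-model",                           # placeholder model name
    "OPENAI_API_KEY": "sk-placeholder",                      # any one of the three key vars works
    "TASK_INDICES": "0,10,20",                               # optional explicit task selection
})
subprocess.run(["python", "inference.py"], check=True)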
my_env_v4.py
ADDED
|
@@ -0,0 +1,46 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass(slots=True)
|
| 7 |
+
class MyEnvV4Action:
|
| 8 |
+
message: str
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(slots=True)
|
| 12 |
+
class _EchoObservation:
|
| 13 |
+
echoed_message: str
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass(slots=True)
|
| 17 |
+
class _EchoResult:
|
| 18 |
+
observation: _EchoObservation
|
| 19 |
+
reward: float = 0.0
|
| 20 |
+
done: bool = False
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class MyEnvV4Env:
|
| 24 |
+
def __init__(self) -> None:
|
| 25 |
+
self._step_count = 0
|
| 26 |
+
|
| 27 |
+
@classmethod
|
| 28 |
+
async def from_docker_image(cls, image_name: str | None = None) -> "MyEnvV4Env":
|
| 29 |
+
return cls()
|
| 30 |
+
|
| 31 |
+
async def reset(self) -> _EchoResult:
|
| 32 |
+
self._step_count = 0
|
| 33 |
+
return _EchoResult(observation=_EchoObservation(echoed_message=""), reward=0.0, done=False)
|
| 34 |
+
|
| 35 |
+
async def step(self, action: MyEnvV4Action) -> _EchoResult:
|
| 36 |
+
self._step_count += 1
|
| 37 |
+
message = str(getattr(action, "message", ""))
|
| 38 |
+
reward = len(message) * 0.1
|
| 39 |
+
return _EchoResult(
|
| 40 |
+
observation=_EchoObservation(echoed_message=message),
|
| 41 |
+
reward=reward,
|
| 42 |
+
done=False,
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
async def close(self) -> None:
|
| 46 |
+
return None
|
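A usage sketch for the echo environment above; nothing external is assumed, and the reward is simply 0.1 per character of the echoed message.

import asyncio

from my_env_v4 import MyEnvV4Action, MyEnvV4Env


async def demo() -> None:
    env = await MyEnvV4Env.from_docker_image()  # the image name is ignored by this stub
    await env.reset()
    result = await env.step(MyEnvV4Action(message="hello"))
    print(result.observation.echoed_message, result.reward)  # hello 0.5
    await env.close()


asyncio.run(demo())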
openenv.yaml
ADDED
|
@@ -0,0 +1,66 @@
| 1 |
+
name: osint-openenv
|
| 2 |
+
version: 0.1.0
|
| 3 |
+
description: Synthetic OSINT benchmark environment exposed over HTTP.
|
| 4 |
+
tasks:
|
| 5 |
+
- id: seed_task_0
|
| 6 |
+
difficulty: easy
|
| 7 |
+
max_steps: 24
|
| 8 |
+
grader:
|
| 9 |
+
type: difficulty_exact_match
|
| 10 |
+
answer_type: node_id
|
| 11 |
+
case_sensitive: true
|
| 12 |
+
reward_profile: easy
|
| 13 |
+
- id: seed_task_10
|
| 14 |
+
difficulty: medium
|
| 15 |
+
max_steps: 24
|
| 16 |
+
grader:
|
| 17 |
+
type: difficulty_exact_match
|
| 18 |
+
answer_type: node_id
|
| 19 |
+
case_sensitive: true
|
| 20 |
+
reward_profile: medium
|
| 21 |
+
- id: seed_task_20
|
| 22 |
+
difficulty: hard
|
| 23 |
+
max_steps: 24
|
| 24 |
+
grader:
|
| 25 |
+
type: difficulty_exact_match
|
| 26 |
+
answer_type: node_id
|
| 27 |
+
case_sensitive: true
|
| 28 |
+
reward_profile: hard
|
| 29 |
+
transport:
|
| 30 |
+
type: http
|
| 31 |
+
base_path: /
|
| 32 |
+
endpoints:
|
| 33 |
+
health:
|
| 34 |
+
method: GET
|
| 35 |
+
path: /health
|
| 36 |
+
metadata:
|
| 37 |
+
method: GET
|
| 38 |
+
path: /api/environment
|
| 39 |
+
tasks:
|
| 40 |
+
method: GET
|
| 41 |
+
path: /openenv/tasks
|
| 42 |
+
reset:
|
| 43 |
+
method: POST
|
| 44 |
+
path: /reset
|
| 45 |
+
step:
|
| 46 |
+
method: POST
|
| 47 |
+
path: /step
|
| 48 |
+
state:
|
| 49 |
+
method: GET
|
| 50 |
+
path: /state
|
| 51 |
+
models:
|
| 52 |
+
action_space:
|
| 53 |
+
- CALL_TOOL
|
| 54 |
+
- ADD_EDGE
|
| 55 |
+
- ANSWER
|
| 56 |
+
task_fields:
|
| 57 |
+
- task_id
|
| 58 |
+
- task_type
|
| 59 |
+
- question
|
| 60 |
+
- difficulty
|
| 61 |
+
- grader
|
| 62 |
+
observation_fields:
|
| 63 |
+
- tool_outputs
|
| 64 |
+
- graph_snapshot
|
| 65 |
+
- action_history
|
| 66 |
+
- task
|
pyproject.toml
ADDED
|
@@ -0,0 +1,45 @@
| 1 |
+
[project]
|
| 2 |
+
name = "osint-rl-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "OSINT-style multi-platform information ecosystem environment for LLM agents."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"openenv>=0.1.13",
|
| 9 |
+
"openai>=1.40.0",
|
| 10 |
+
"fastapi>=0.115.0",
|
| 11 |
+
"requests>=2.32.3",
|
| 12 |
+
"uvicorn>=0.30.0",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.optional-dependencies]
|
| 16 |
+
dev = [
|
| 17 |
+
"pytest>=8.0.0",
|
| 18 |
+
]
|
| 19 |
+
train = [
|
| 20 |
+
"datasets>=2.20.0",
|
| 21 |
+
"transformers>=4.45.0",
|
| 22 |
+
"accelerate>=0.33.0",
|
| 23 |
+
"trl>=0.15.0",
|
| 24 |
+
"peft>=0.11.0",
|
| 25 |
+
"pillow",
|
| 26 |
+
"torchvision",
|
| 27 |
+
"wandb",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
[project.scripts]
|
| 31 |
+
osint-env = "osint_env.cli:main"
|
| 32 |
+
server = "osint_env.server_entry:main"
|
| 33 |
+
|
| 34 |
+
[build-system]
|
| 35 |
+
requires = ["setuptools>=68", "wheel"]
|
| 36 |
+
build-backend = "setuptools.build_meta"
|
| 37 |
+
|
| 38 |
+
[tool.setuptools]
|
| 39 |
+
package-dir = {"" = "src"}
|
| 40 |
+
|
| 41 |
+
[tool.setuptools.packages.find]
|
| 42 |
+
where = ["src"]
|
| 43 |
+
|
| 44 |
+
[tool.pytest.ini_options]
|
| 45 |
+
testpaths = ["tests"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
| 1 |
+
openenv>=0.1.13
|
| 2 |
+
openai>=1.40.0
|
| 3 |
+
fastapi>=0.115.0
|
| 4 |
+
requests>=2.32.3
|
| 5 |
+
uvicorn>=0.30.0
|
| 6 |
+
pytest>=8.0.0
|
| 7 |
+
datasets>=2.20.0
|
| 8 |
+
transformers>=4.45.0
|
| 9 |
+
accelerate>=0.33.0
|
| 10 |
+
trl>=0.15.0
|
| 11 |
+
peft>=0.11.0
|
scripts/build_fixed_levels_dataset.py
ADDED
|
@@ -0,0 +1,197 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from dataclasses import asdict
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
|
| 11 |
+
from osint_env.data.generator import DatasetGenerator
|
| 12 |
+
from osint_env.domain.models import Edge, TaskInstance
|
| 13 |
+
from osint_env.llm import build_llm_client
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def edge_to_dict(edge: Edge) -> dict[str, Any]:
|
| 17 |
+
return {
|
| 18 |
+
"src": edge.src,
|
| 19 |
+
"rel": edge.rel,
|
| 20 |
+
"dst": edge.dst,
|
| 21 |
+
"confidence": float(edge.confidence),
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def task_to_dict(task: TaskInstance) -> dict[str, Any]:
|
| 26 |
+
return {
|
| 27 |
+
"task_id": task.task_id,
|
| 28 |
+
"task_type": task.task_type,
|
| 29 |
+
"question": task.question,
|
| 30 |
+
"answer": task.answer,
|
| 31 |
+
"supporting_edges": [edge_to_dict(e) for e in task.supporting_edges],
|
| 32 |
+
"metadata": dict(task.metadata),
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def build_fixed_snapshot(seed_path: Path) -> dict[str, Any]:
|
| 37 |
+
seeding = load_seeding_config(seed_path)
|
| 38 |
+
fixed_nodes = []
|
| 39 |
+
for node in seeding.seeded_nodes:
|
| 40 |
+
fixed_nodes.append(
|
| 41 |
+
{
|
| 42 |
+
"node_id": node.node_id,
|
| 43 |
+
"node_type": str(getattr(node.node_type, "value", node.node_type)),
|
| 44 |
+
"attrs": dict(node.attrs),
|
| 45 |
+
}
|
| 46 |
+
)
|
| 47 |
+
fixed_edges = [
|
| 48 |
+
{
|
| 49 |
+
"src": edge.src,
|
| 50 |
+
"rel": edge.rel,
|
| 51 |
+
"dst": edge.dst,
|
| 52 |
+
"confidence": float(edge.confidence),
|
| 53 |
+
}
|
| 54 |
+
for edge in seeding.seeded_edges
|
| 55 |
+
]
|
| 56 |
+
fixed_questions = []
|
| 57 |
+
for idx, q in enumerate(seeding.seeded_questions):
|
| 58 |
+
fixed_questions.append(
|
| 59 |
+
{
|
| 60 |
+
"task_id": f"fixed_task_{idx:02d}",
|
| 61 |
+
"task_type": q.task_type,
|
| 62 |
+
"question": q.question,
|
| 63 |
+
"answer": q.answer,
|
| 64 |
+
"supporting_edges": [
|
| 65 |
+
{
|
| 66 |
+
"src": edge.src,
|
| 67 |
+
"rel": edge.rel,
|
| 68 |
+
"dst": edge.dst,
|
| 69 |
+
"confidence": float(edge.confidence),
|
| 70 |
+
}
|
| 71 |
+
for edge in q.supporting_edges
|
| 72 |
+
],
|
| 73 |
+
"metadata": dict(q.metadata),
|
| 74 |
+
}
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
difficulty_counts = Counter(str(q.get("metadata", {}).get("difficulty", "unknown")) for q in fixed_questions)
|
| 78 |
+
return {
|
| 79 |
+
"dataset_name": "fixed_levels_submission_set",
|
| 80 |
+
"source_seed": str(seed_path),
|
| 81 |
+
"graph": {
|
| 82 |
+
"nodes": fixed_nodes,
|
| 83 |
+
"edges": fixed_edges,
|
| 84 |
+
"node_count": len(fixed_nodes),
|
| 85 |
+
"edge_count": len(fixed_edges),
|
| 86 |
+
},
|
| 87 |
+
"questions": fixed_questions,
|
| 88 |
+
"question_count": len(fixed_questions),
|
| 89 |
+
"difficulty_counts": dict(difficulty_counts),
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def build_complete_snapshot(shared_config_path: Path, seed_path: Path) -> dict[str, Any]:
|
| 94 |
+
shared = load_shared_config(shared_config_path)
|
| 95 |
+
env_cfg = clone_environment_config(shared.environment)
|
| 96 |
+
env_cfg.seeding = load_seeding_config(seed_path)
|
| 97 |
+
|
| 98 |
+
llm_client = build_llm_client(env_cfg.llm)
|
| 99 |
+
generator = DatasetGenerator(config=env_cfg, llm=llm_client)
|
| 100 |
+
|
| 101 |
+
graph = generator.build_canonical_graph()
|
| 102 |
+
views = generator.build_platform_views(graph)
|
| 103 |
+
tasks = generator.generate_tasks(graph, views, count=max(15, len(env_cfg.seeding.seeded_questions)))
|
| 104 |
+
|
| 105 |
+
difficulty_counts = Counter(str(task.metadata.get("difficulty", "unknown")) for task in tasks)
|
| 106 |
+
|
| 107 |
+
return {
|
| 108 |
+
"dataset_name": "fixed_levels_submission_set",
|
| 109 |
+
"generation_mode": "llm_expanded",
|
| 110 |
+
"shared_config": str(shared_config_path),
|
| 111 |
+
"seed_file": str(seed_path),
|
| 112 |
+
"llm": asdict(env_cfg.llm),
|
| 113 |
+
"environment": {
|
| 114 |
+
"n_users": env_cfg.n_users,
|
| 115 |
+
"alias_density": env_cfg.alias_density,
|
| 116 |
+
"noise_level": env_cfg.noise_level,
|
| 117 |
+
"red_herring_rate": env_cfg.red_herring_rate,
|
| 118 |
+
"seed": env_cfg.seed,
|
| 119 |
+
},
|
| 120 |
+
"canonical_graph": {
|
| 121 |
+
"node_count": len(graph.nodes),
|
| 122 |
+
"edge_count": len(graph.edges),
|
| 123 |
+
"nodes": [
|
| 124 |
+
{
|
| 125 |
+
"node_id": node.node_id,
|
| 126 |
+
"node_type": node.node_type.value,
|
| 127 |
+
"attrs": dict(node.attrs),
|
| 128 |
+
}
|
| 129 |
+
for node in sorted(graph.nodes.values(), key=lambda n: n.node_id)
|
| 130 |
+
],
|
| 131 |
+
"edges": [edge_to_dict(edge) for edge in graph.edges],
|
| 132 |
+
},
|
| 133 |
+
"platform_views": {
|
| 134 |
+
"microblog_posts": views.microblog_posts,
|
| 135 |
+
"forum_threads": views.forum_threads,
|
| 136 |
+
"profiles": views.profiles,
|
| 137 |
+
"counts": {
|
| 138 |
+
"microblog_posts": len(views.microblog_posts),
|
| 139 |
+
"forum_threads": len(views.forum_threads),
|
| 140 |
+
"profiles": len(views.profiles),
|
| 141 |
+
},
|
| 142 |
+
},
|
| 143 |
+
"tasks": [task_to_dict(task) for task in tasks],
|
| 144 |
+
"task_count": len(tasks),
|
| 145 |
+
"difficulty_counts": dict(difficulty_counts),
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def main() -> None:
|
| 150 |
+
parser = argparse.ArgumentParser(description="Build fixed difficulty dataset artifacts.")
|
| 151 |
+
parser.add_argument(
|
| 152 |
+
"--seed-file",
|
| 153 |
+
default="datasets/fixed_levels/seed_fixed_levels.json",
|
| 154 |
+
help="Path to seeding JSON with fixed graph/questions.",
|
| 155 |
+
)
|
| 156 |
+
parser.add_argument(
|
| 157 |
+
"--shared-config",
|
| 158 |
+
default="datasets/fixed_levels/shared_config_fixed_levels.json",
|
| 159 |
+
help="Path to shared config used for LLM-expanded generation.",
|
| 160 |
+
)
|
| 161 |
+
parser.add_argument(
|
| 162 |
+
"--output-dir",
|
| 163 |
+
default="datasets/fixed_levels",
|
| 164 |
+
help="Directory where dataset artifacts are written.",
|
| 165 |
+
)
|
| 166 |
+
args = parser.parse_args()
|
| 167 |
+
|
| 168 |
+
output_dir = Path(args.output_dir)
|
| 169 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 170 |
+
|
| 171 |
+
seed_path = Path(args.seed_file)
|
| 172 |
+
shared_path = Path(args.shared_config)
|
| 173 |
+
|
| 174 |
+
fixed_snapshot = build_fixed_snapshot(seed_path)
|
| 175 |
+
fixed_path = output_dir / "fixed_graph_questions.json"
|
| 176 |
+
fixed_path.write_text(json.dumps(fixed_snapshot, indent=2, sort_keys=True), encoding="utf-8")
|
| 177 |
+
|
| 178 |
+
complete_snapshot = build_complete_snapshot(shared_path, seed_path)
|
| 179 |
+
complete_path = output_dir / "complete_dataset_qwen_generated.json"
|
| 180 |
+
complete_path.write_text(json.dumps(complete_snapshot, indent=2, sort_keys=True), encoding="utf-8")
|
| 181 |
+
|
| 182 |
+
summary = {
|
| 183 |
+
"fixed_dataset": str(fixed_path),
|
| 184 |
+
"complete_dataset": str(complete_path),
|
| 185 |
+
"fixed_nodes": fixed_snapshot["graph"]["node_count"],
|
| 186 |
+
"fixed_edges": fixed_snapshot["graph"]["edge_count"],
|
| 187 |
+
"fixed_questions": fixed_snapshot["question_count"],
|
| 188 |
+
"complete_nodes": complete_snapshot["canonical_graph"]["node_count"],
|
| 189 |
+
"complete_edges": complete_snapshot["canonical_graph"]["edge_count"],
|
| 190 |
+
"complete_tasks": complete_snapshot["task_count"],
|
| 191 |
+
"difficulty_counts": complete_snapshot["difficulty_counts"],
|
| 192 |
+
}
|
| 193 |
+
print(json.dumps(summary, indent=2, sort_keys=True))
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":
|
| 197 |
+
main()
|
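A sketch of building the fixed snapshot in-process instead of via the CLI. scripts/ is not an installed package, so the sys.path insertion is a workaround for illustration only, and the osint_env package must already be importable because the script imports it at module load.

import sys
from pathlib import Path

sys.path.insert(0, "scripts")  # workaround: scripts/ is not packaged
from build_fixed_levels_dataset import build_fixed_snapshot

snapshot = build_fixed_snapshot(Path("datasets/fixed_levels/seed_fixed_levels.json"))
print(snapshot["question_count"], snapshot["difficulty_counts"])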
scripts/generate_fixed_levels_seed.py
ADDED
|
@@ -0,0 +1,109 @@
| 1 |
+
from collections import Counter, OrderedDict
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
U=[('aria','Aria Sen','Helios Labs','Sector 9'),('bharat','Bharat Kulkarni','Northbridge Logistics','Dockyard 17'),('cyrus','Cyrus Mehta','Apex Dynamics','Old Town'),('diya','Diya Roy','Blueharbor Media','Old Town'),('elin','Elin Das','Helios Labs','Sector 9'),('faris','Faris Noor','Tidewatch Ops','Rivergate'),('gita','Gita Pradhan','Apex Dynamics','Old Town'),('hiro','Hiro Tan','Northbridge Logistics','Dockyard 17'),('ivy','Ivy Kapoor','Kestrel Works','Rivergate'),('jules','Jules Banerjee','Blueharbor Media','Old Town'),('kian','Kian Bose','Atlas Freight','East Quay'),('leena','Leena Das','Sunmesh Analytics','Sector 9'),('mika','Mika Solanki','Orion Customs','North Basin'),('nora','Nora Iqbal','Emberline Security','Foundry Row'),('omar','Omar Sheikh','Atlas Freight','East Quay'),('priya','Priya Menon','Sunmesh Analytics','Sector 9'),('quinn','Quinn Rao','Orion Customs','North Basin'),('rhea','Rhea Kapoor','Emberline Security','Foundry Row'),('soren','Soren Malik','Harborlight Transit','Uplink Yard'),('tara','Tara Dey','Harborlight Transit','Uplink Yard')]
|
| 6 |
+
A=[('orchidfox','@orchidfox','ivy'),('steelquill','@steelquill','bharat'),('monsoonbyte','@monsoonbyte','diya'),('nightrelay','@nightrelay','faris'),('mapleghost','@mapleghost','elin'),('docksparrow','@docksparrow','hiro'),('quartzlotus','@quartzlotus','cyrus'),('emberglass','@emberglass','nora'),('basinraven','@basinraven','mika'),('tideshard','@tideshard','soren'),('hollowsignal','@hollowsignal','priya'),('ironwhisper','@ironwhisper','omar'),('cinderveil','@cinderveil','rhea'),('sablekeel','@sablekeel','tara'),('lanternmoth','@lanternmoth','kian'),('frostledger','@frostledger','leena')]
|
| 7 |
+
L=[('dockyard17','Dockyard 17'),('sector9','Sector 9'),('old_town','Old Town'),('rivergate','Rivergate'),('east_quay','East Quay'),('foundry_row','Foundry Row'),('north_basin','North Basin'),('uplink_yard','Uplink Yard')]
|
| 8 |
+
O=[('helios_labs','Helios Labs','sector9'),('northbridge_logistics','Northbridge Logistics','dockyard17'),('apex_dynamics','Apex Dynamics','old_town'),('blueharbor_media','Blueharbor Media','old_town'),('tidewatch_ops','Tidewatch Ops','rivergate'),('kestrel_works','Kestrel Works','rivergate'),('atlas_freight','Atlas Freight','east_quay'),('sunmesh_analytics','Sunmesh Analytics','sector9'),('orion_customs','Orion Customs','north_basin'),('emberline_security','Emberline Security','foundry_row'),('harborlight_transit','Harborlight Transit','uplink_yard')]
|
| 9 |
+
E=[('project_lantern','Project Lantern'),('black_kite','Black Kite'),('silent_current','Silent Current'),('amber_veil','Amber Veil'),('glass_harbor','Glass Harbor'),('ember_tide','Ember Tide'),('iron_wharf','Iron Wharf'),('ghost_signal','Ghost Signal')]
|
| 10 |
+
T=[('supply_leak','supply_chain'),('port_audit','port_audit'),('customs_breach','customs_breach'),('relay_map','relay_map'),('foundry_watch','foundry_watch'),('basin_shift','basin_shift'),('quiet_manifest','quiet_manifest'),('uplink_route','uplink_route'),('ember_tide_watch','ember_tide'),('ghost_signal_net','ghost_signal')]
|
| 11 |
+
P=['shift_roster','midnight_manifest','sat_phone_ping','drone_parts','relay_schedule','quay_ledgers','customs_tag','hull_signal','basin_photo','foundry_map','lantern_route','uplink_note']
|
| 12 |
+
|
| 13 |
+
def uid(x): return f'user_{x}'
|
| 14 |
+
def aid(x): return f'alias_{x}'
|
| 15 |
+
def oid(x): return f'org_{x}'
|
| 16 |
+
def lid(x): return f'loc_{x}'
|
| 17 |
+
def eid(x): return f'event_{x}'
|
| 18 |
+
def tid(x): return f'thr_{x}'
|
| 19 |
+
def pid(x): return f'post_{x}'
|
| 20 |
+
|
| 21 |
+
def addn(nodes,nid,nt,attrs): nodes.append({'node_id':nid,'node_type':nt,'attrs':attrs})
|
| 22 |
+
|
| 23 |
+
def build():
|
| 24 |
+
nodes=[]; edges=OrderedDict();
|
| 25 |
+
for s,name,org,loc in U: addn(nodes,uid(s),'user',{'name':name,'org':org,'location':loc})
|
| 26 |
+
for s,handle,user in A: addn(nodes,aid(s),'alias',{'handle':handle})
|
| 27 |
+
for s,name,_ in O: addn(nodes,oid(s),'org',{'name':name})
|
| 28 |
+
for s,name in L: addn(nodes,lid(s),'location',{'name':name})
|
| 29 |
+
for s,name in E: addn(nodes,eid(s),'event',{'name':name})
|
| 30 |
+
for s,topic in T: addn(nodes,tid(s),'thread',{'topic':topic})
|
| 31 |
+
for s in P: addn(nodes,pid(s),'post',{'channel':'microblog'})
|
| 32 |
+
def ae(k,src,rel,dst,c=1.0): edges[k]={'src':src,'rel':rel,'dst':dst,'confidence':c}
|
| 33 |
+
for s,_,user in A: ae(f'a_{s}',aid(s),'alias_of',uid(user))
|
| 34 |
+
org_map={name:oid(s) for s,name,_ in O}; loc_map={name:lid(s) for s,name in L}
|
| 35 |
+
for s,_,org,loc in U: ae(f'w_{s}',uid(s),'works_at',org_map[org]); ae(f'l_{s}',uid(s),'located_in',loc_map[loc])
|
| 36 |
+
for s,_,loc in O: ae(f'op_{s}',oid(s),'operates_in',lid(loc))
|
| 37 |
+
CP=[('ivy','bharat',.95),('bharat','hiro',.95),('hiro','faris',.92),('faris','diya',.90),('diya','elin',.89),('elin','aria',.87),('aria','cyrus',.84),('cyrus','gita',.83),('gita','jules',.82),('jules','bharat',.81),('diya','ivy',.90),('ivy','elin',.86),('kian','omar',.93),('omar','mika',.90),('mika','quinn',.89),('quinn','nora',.88),('nora','rhea',.87),('rhea','soren',.86),('soren','tara',.86),('tara','kian',.84),('priya','leena',.91),('leena','aria',.83),('priya','nora',.82),('kian','bharat',.80),('soren','faris',.79),('quinn','hiro',.78)]
|
| 38 |
+
for i,(a,b,c) in enumerate(CP,1): ae(f'c{i:02d}',uid(a),'connected_to',uid(b),c)
|
| 39 |
+
PA={'midnight_manifest':'orchidfox','shift_roster':'docksparrow','sat_phone_ping':'nightrelay','drone_parts':'monsoonbyte','relay_schedule':'steelquill','quay_ledgers':'lanternmoth','customs_tag':'basinraven','hull_signal':'tideshard','basin_photo':'emberglass','foundry_map':'cinderveil','lantern_route':'frostledger','uplink_note':'sablekeel'}
|
| 40 |
+
for post,author in PA.items(): ae(f'ap_{post}',aid(author),'authored_post',pid(post))
|
| 41 |
+
PR={'midnight_manifest':['dockyard17','project_lantern'],'shift_roster':['dockyard17','northbridge_logistics'],'sat_phone_ping':['rivergate','project_lantern'],'drone_parts':['black_kite','kestrel_works'],'relay_schedule':['project_lantern','sector9'],'quay_ledgers':['east_quay','glass_harbor'],'customs_tag':['north_basin','iron_wharf'],'hull_signal':['uplink_yard','ghost_signal'],'basin_photo':['foundry_row','amber_veil'],'foundry_map':['foundry_row','ember_tide'],'lantern_route':['project_lantern','sunmesh_analytics'],'uplink_note':['uplink_yard','harborlight_transit']}
|
| 42 |
+
for post,refs in PR.items():
|
| 43 |
+
for i,x in enumerate(refs,1): ae(f'r_{post}_{i}',pid(post),'references', lid(x) if x in {y for y,_ in L} else (oid(x) if x in {y for y,_,_ in O} else eid(x)))
|
| 44 |
+
TA={'supply_leak':'diya','port_audit':'jules','customs_breach':'mika','relay_map':'leena','foundry_watch':'nora','basin_shift':'quinn','quiet_manifest':'kian','uplink_route':'soren','ember_tide_watch':'rhea','ghost_signal_net':'tara'}
|
| 45 |
+
TL={'supply_leak':[('discusses','project_lantern'),('references','northbridge_logistics')],'port_audit':[('discusses','black_kite'),('references','kestrel_works')],'customs_breach':[('discusses','iron_wharf'),('references','orion_customs')],'relay_map':[('discusses','project_lantern'),('references','sunmesh_analytics')],'foundry_watch':[('discusses','ember_tide'),('references','emberline_security')],'basin_shift':[('discusses','amber_veil'),('references','north_basin')],'quiet_manifest':[('discusses','glass_harbor'),('references','atlas_freight')],'uplink_route':[('discusses','ghost_signal'),('references','harborlight_transit')],'ember_tide_watch':[('discusses','ember_tide'),('references','foundry_row')],'ghost_signal_net':[('discusses','ghost_signal'),('references','uplink_yard')]}
|
| 46 |
+
for t,u in TA.items(): ae(f'at_{t}',uid(u),'authored_thread',tid(t))
|
| 47 |
+
for t,rels in TL.items():
|
| 48 |
+
for i,(rel,x) in enumerate(rels,1): ae(f'tl_{t}_{i}',tid(t),rel, lid(x) if x in {y for y,_ in L} else (oid(x) if x in {y for y,_,_ in O} else eid(x)))
|
| 49 |
+
ER=[('bharat','collaborates_on','project_lantern'),('hiro','collaborates_on','project_lantern'),('faris','collaborates_on','project_lantern'),('diya','investigates','project_lantern'),('leena','monitors','project_lantern'),('ivy','collaborates_on','black_kite'),('cyrus','collaborates_on','black_kite'),('elin','investigates','black_kite'),('jules','reports_on','black_kite'),('kian','collaborates_on','glass_harbor'),('omar','collaborates_on','glass_harbor'),('priya','monitors','glass_harbor'),('mika','collaborates_on','iron_wharf'),('quinn','collaborates_on','iron_wharf'),('nora','investigates','amber_veil'),('rhea','collaborates_on','ember_tide'),('soren','collaborates_on','ghost_signal'),('tara','reports_on','ghost_signal'),('gita','monitors','silent_current'),('jules','reports_on','silent_current')]
|
| 50 |
+
for i,(u,rel,e) in enumerate(ER,1): ae(f'er{i:02d}',uid(u),rel,eid(e),.9)
|
| 51 |
+
X=[(eid('project_lantern'),'connected_to',eid('glass_harbor')),(eid('black_kite'),'connected_to',eid('amber_veil')),(eid('ember_tide'),'connected_to',eid('ghost_signal')),(oid('atlas_freight'),'connected_to',oid('northbridge_logistics')),(oid('orion_customs'),'connected_to',oid('emberline_security')),(oid('harborlight_transit'),'connected_to',oid('tidewatch_ops'))]
|
| 52 |
+
for i,(a,rel,b) in enumerate(X,1): ae(f'x{i:02d}',a,rel,b,.77)
|
| 53 |
+
return nodes,edges
|
| 54 |
+
|
| 55 |
+
def mk_questions(edges):
|
| 56 |
+
def ids(*items):
|
| 57 |
+
out=[]
|
| 58 |
+
for it in items:
|
| 59 |
+
if isinstance(it,list): out.extend(it)
|
| 60 |
+
else: out.append(it)
|
| 61 |
+
return out
|
| 62 |
+
def rng(prefix,a,b): return [f'{prefix}{i:02d}' for i in range(a,b+1)]
|
| 63 |
+
def sup(edge_ids): return [edges[e] for e in edge_ids]
|
| 64 |
+
def nodes(edge_ids):
|
| 65 |
+
s=set()
|
| 66 |
+
for e in edge_ids: s|={edges[e]['src'],edges[e]['dst']}
|
| 67 |
+
return len(s)
|
| 68 |
+
qs=[]
|
| 69 |
+
easy=[('easy_01','alias_orchidfox -> post_midnight_manifest -> loc_dockyard17 -> connected collaborator on event_project_lantern. Who is it?','user_bharat',ids('a_orchidfox','ap_midnight_manifest','r_midnight_manifest_1','c01','er01')),('easy_02','thr_supply_leak references org_northbridge_logistics. Which alias_docksparrow user works there and collaborates on event_project_lantern?','user_hiro',ids('tl_supply_leak_2','a_docksparrow','w_hiro','er02')),('easy_03','alias_monsoonbyte authored post_drone_parts about event_black_kite. Which user behind that alias is directly connected to the Kestrel collaborator?','user_diya',ids('a_monsoonbyte','ap_drone_parts','r_drone_parts_1','w_ivy','er06','c12')),('easy_04','alias_nightrelay references loc_rivergate. Which user behind it works at an org operating there and collaborates on event_project_lantern?','user_faris',ids('a_nightrelay','ap_sat_phone_ping','r_sat_phone_ping_1','w_faris','op_tidewatch_ops','er03')),('easy_05','thr_port_audit discusses Black Kite and references Kestrel Works. Which alias_orchidfox user authored post_midnight_manifest and collaborates on Black Kite?','user_ivy',ids('tl_port_audit_1','tl_port_audit_2','a_orchidfox','ap_midnight_manifest','w_ivy','er06')),('easy_06','Which Atlas Freight user behind alias_lanternmoth authored post_quay_ledgers and collaborates on event_glass_harbor?','user_kian',ids('a_lanternmoth','ap_quay_ledgers','w_kian','er10')),('easy_07','Which Orion Customs user behind alias_basinraven authored post_customs_tag and collaborates on event_iron_wharf?','user_mika',ids('a_basinraven','ap_customs_tag','w_mika','er13')),('easy_08','Which user behind alias_emberglass posted basin_photo from Foundry Row and investigates Amber Veil?','user_nora',ids('a_emberglass','ap_basin_photo','r_basin_photo_1','er15')),('easy_09','Which user behind alias_tideshard authored post_hull_signal and collaborates on Ghost Signal?','user_soren',ids('a_tideshard','ap_hull_signal','er17')),('easy_10','Which Harborlight Transit user behind alias_sablekeel authored post_uplink_note and reports on Ghost Signal?','user_tara',ids('a_sablekeel','ap_uplink_note','w_tara','er18'))]
|
| 70 |
+
mid=[('mid_01','Follow alias_docksparrow through post_shift_roster, Dockyard 17, and the Lantern chain. Return the org node id.','org_northbridge_logistics',ids('a_docksparrow','ap_shift_roster','r_shift_roster_1','r_shift_roster_2','tl_supply_leak_2','w_hiro','l_hiro','er02','er01','c02','c03')),('mid_02','Across the Glass Harbor cluster, which user behind alias_lanternmoth links to the Atlas Freight network from thr_quiet_manifest?','user_kian',ids('a_lanternmoth','ap_quay_ledgers','r_quay_ledgers_1','r_quay_ledgers_2','at_quiet_manifest','tl_quiet_manifest_1','tl_quiet_manifest_2','w_kian','w_omar','er10','er11','er12','c13','c14')),('mid_03','Trace alias_basinraven through post_customs_tag, thr_customs_breach, and the Orion Customs collaboration chain. Who is it?','user_mika',ids('a_basinraven','ap_customs_tag','r_customs_tag_1','r_customs_tag_2','at_customs_breach','tl_customs_breach_1','tl_customs_breach_2','w_mika','w_quinn','er13','er14','c15','c16','x05')),('mid_04','In the Ember Tide and Amber Veil overlap, which Foundry Row user behind alias_cinderveil collaborates on Ember Tide?','user_rhea',ids('a_cinderveil','ap_foundry_map','r_foundry_map_1','r_foundry_map_2','at_foundry_watch','tl_foundry_watch_1','tl_foundry_watch_2','at_ember_tide_watch','tl_ember_tide_watch_1','tl_ember_tide_watch_2','w_rhea','w_nora','er15','er16','c17','x03')),('mid_05','Follow alias_tideshard from post_hull_signal into thr_uplink_route and the Harborlight relay. Return the org node id.','org_harborlight_transit',ids('a_tideshard','ap_hull_signal','r_hull_signal_1','r_hull_signal_2','at_uplink_route','tl_uplink_route_1','tl_uplink_route_2','w_soren','w_tara','er17','er18','c18','c19','op_harborlight_transit','x06')),('mid_06','Which Sunmesh user behind alias_frostledger connects post_lantern_route to thr_relay_map and the Sector 9 monitoring chain?','user_leena',ids('a_frostledger','ap_lantern_route','r_lantern_route_1','r_lantern_route_2','at_relay_map','tl_relay_map_1','tl_relay_map_2','w_leena','w_priya','l_leena','op_sunmesh_analytics','er05','c21','c22')),('mid_07','Which user behind alias_emberglass is tied to Amber Veil after combining post_basin_photo, thr_basin_shift, and the Foundry Row investigation chain?','user_nora',ids('a_emberglass','ap_basin_photo','r_basin_photo_1','r_basin_photo_2','at_basin_shift','tl_basin_shift_1','tl_basin_shift_2','w_nora','w_quinn','l_nora','er15','c16','c17','x05')),('mid_08','Combine alias_orchidfox, post_midnight_manifest, thr_supply_leak, and the Lantern to Glass Harbor bridge. Which user starts that chain?','user_ivy',ids('a_orchidfox','ap_midnight_manifest','r_midnight_manifest_1','r_midnight_manifest_2','at_supply_leak','tl_supply_leak_1','tl_supply_leak_2','w_ivy','er06','c01','c12','x01','er10','er12')),('mid_09','Which user behind alias_monsoonbyte sits at the overlap of Blueharbor Media, Project Lantern, Black Kite, and the Ivy connection chain?','user_diya',ids('a_monsoonbyte','ap_drone_parts','r_drone_parts_1','at_supply_leak','tl_supply_leak_1','at_port_audit','tl_port_audit_1','w_diya','w_ivy','w_jules','er04','er06','er09','c04','c12')),('mid_10','Who is the Northbridge user behind alias_steelquill when combining post_relay_schedule, thr_supply_leak, Dockyard 17, and Lantern collaborator edges?','user_bharat',ids('a_steelquill','ap_relay_schedule','r_relay_schedule_1','r_relay_schedule_2','at_supply_leak','tl_supply_leak_1','tl_supply_leak_2','w_bharat','w_hiro','l_bharat','l_hiro','er01','er02','c01','c02'))]
|
| 71 |
+
big=list(edges.keys())[:58]
|
| 72 |
+
hard=[('high_01','Lantern to Glass Harbor handoff: identify the user behind alias_orchidfox after combining Lantern logistics, Dockyard links, and Atlas Freight bridge evidence.','user_ivy',ids('a_orchidfox','ap_midnight_manifest','r_midnight_manifest_1','r_midnight_manifest_2','at_supply_leak','tl_supply_leak_1','tl_supply_leak_2',['w_ivy','w_bharat','w_hiro','w_kian','w_omar'],['l_ivy','l_bharat','l_hiro','l_kian','l_omar'],['op_northbridge_logistics','op_kestrel_works','op_atlas_freight'],rng('c',1,3),['c12','c13','c14'],['er01','er02','er03','er06','er10','er11','er12'],'at_quiet_manifest','tl_quiet_manifest_1','tl_quiet_manifest_2','ap_quay_ledgers','r_quay_ledgers_1','r_quay_ledgers_2','x01','x04','a_lanternmoth','a_steelquill','a_docksparrow')),('high_02','North Basin to Foundry Row escalation: which user behind alias_basinraven anchors the Iron Wharf side before the Emberline handoff?','user_mika',ids('a_basinraven','ap_customs_tag','r_customs_tag_1','r_customs_tag_2','at_customs_breach','tl_customs_breach_1','tl_customs_breach_2','at_basin_shift','tl_basin_shift_1','tl_basin_shift_2','at_foundry_watch','tl_foundry_watch_1','tl_foundry_watch_2',['w_mika','w_quinn','w_nora','w_rhea'],['l_mika','l_quinn','l_nora','l_rhea'],['op_orion_customs','op_emberline_security'],['c15','c16','c17'],['er13','er14','er15','er16'],'ap_basin_photo','r_basin_photo_1','r_basin_photo_2','ap_foundry_map','r_foundry_map_1','r_foundry_map_2','x02','x03','x05','a_emberglass','a_cinderveil','c23','c24')),('high_03','Harborlight ghost-signal relay: identify the user behind alias_tideshard at the Harborlight / Tidewatch junction.','user_soren',ids('a_tideshard','ap_hull_signal','r_hull_signal_1','r_hull_signal_2','a_sablekeel','ap_uplink_note','r_uplink_note_1','r_uplink_note_2','at_uplink_route','tl_uplink_route_1','tl_uplink_route_2','at_ghost_signal_net','tl_ghost_signal_net_1','tl_ghost_signal_net_2',['w_soren','w_tara','w_faris'],['l_soren','l_tara','l_faris'],['op_harborlight_transit','op_tidewatch_ops'],['c18','c19','c20','c25'],['er03','er17','er18'],'ap_sat_phone_ping','r_sat_phone_ping_1','r_sat_phone_ping_2','at_supply_leak','tl_supply_leak_1','er01','er02','x03','x06','a_nightrelay')),('high_04','Blueharbor to Black Kite to Lantern overlap: which user is the Blueharbor origin behind alias_monsoonbyte?','user_diya',ids('a_monsoonbyte','ap_drone_parts','r_drone_parts_1','r_drone_parts_2','at_port_audit','tl_port_audit_1','tl_port_audit_2','at_supply_leak','tl_supply_leak_1','tl_supply_leak_2',['w_diya','w_jules','w_ivy','w_cyrus'],['l_diya','l_jules','l_ivy','l_cyrus'],['op_blueharbor_media','op_kestrel_works','op_apex_dynamics'],['c04','c08','c09','c12'],['er04','er06','er07','er08','er09'],'a_orchidfox','ap_midnight_manifest','r_midnight_manifest_2','x01','x02','at_relay_map','tl_relay_map_1','w_leena','er05')),('high_05','Sector 9 to Dockyard 17 full relay: which user behind alias_steelquill links the Northbridge chain and the Sunmesh monitoring 
bridge?','user_bharat',ids('a_steelquill','ap_relay_schedule','r_relay_schedule_1','r_relay_schedule_2','a_frostledger','ap_lantern_route','r_lantern_route_1','r_lantern_route_2','at_relay_map','tl_relay_map_1','tl_relay_map_2','at_supply_leak','tl_supply_leak_1','tl_supply_leak_2',['w_bharat','w_hiro','w_leena','w_priya','w_aria'],['l_bharat','l_hiro','l_leena','l_priya','l_aria'],['op_northbridge_logistics','op_sunmesh_analytics','op_helios_labs'],['c01','c02','c05','c06','c07','c21','c22'],['er01','er02','er05'],'x01','x04','a_docksparrow','a_mapleghost','a_hollowsignal')),('high_06','Foundry Row, North Basin, and Uplink Yard spread: identify the user behind alias_emberglass before the Harborlight relay takes over.','user_nora',ids('a_emberglass','ap_basin_photo','r_basin_photo_1','r_basin_photo_2','a_cinderveil','ap_foundry_map','r_foundry_map_1','r_foundry_map_2','a_sablekeel','ap_uplink_note','r_uplink_note_1','r_uplink_note_2','at_foundry_watch','tl_foundry_watch_1','tl_foundry_watch_2','at_ember_tide_watch','tl_ember_tide_watch_1','tl_ember_tide_watch_2','at_uplink_route','tl_uplink_route_1','tl_uplink_route_2',['w_nora','w_rhea','w_soren','w_tara'],['l_nora','l_rhea','l_soren','l_tara'],['op_emberline_security','op_harborlight_transit'],['c17','c18','c19'],['er15','er16','er17','er18'],'x03','x06')),('high_07','Freight and customs bridge: which Atlas Freight user behind alias_lanternmoth connects Glass Harbor with the Northbridge chain?','user_kian',ids('a_lanternmoth','ap_quay_ledgers','r_quay_ledgers_1','r_quay_ledgers_2','at_quiet_manifest','tl_quiet_manifest_1','tl_quiet_manifest_2',['w_kian','w_omar','w_bharat','w_hiro'],['l_kian','l_omar','l_bharat','l_hiro'],['op_atlas_freight','op_northbridge_logistics'],['c13','c14','c24','c02'],['er10','er11','er12','er01','er02'],'ap_shift_roster','r_shift_roster_1','r_shift_roster_2','ap_midnight_manifest','r_midnight_manifest_1','at_supply_leak','tl_supply_leak_2','x04','a_ironwhisper','a_steelquill','a_docksparrow')),('high_08','Black Kite, Amber Veil, and Iron Wharf overlap: which user behind alias_quartzlotus is the Apex-side collaborator?','user_cyrus',ids('a_quartzlotus','w_cyrus','l_cyrus','op_apex_dynamics','er07','at_port_audit','tl_port_audit_1','ap_drone_parts','r_drone_parts_1','er15','at_basin_shift','tl_basin_shift_1','er13','at_customs_breach','tl_customs_breach_1',['w_ivy','w_nora','w_mika','w_quinn'],['l_ivy','l_nora','l_mika','l_quinn'],['op_kestrel_works','op_emberline_security','op_orion_customs'],['c08','c12','c15','c16','c17'],'x02','x05','a_orchidfox','a_basinraven','a_emberglass')),('high_09','Ghost Signal and Ember Tide relay: which user behind alias_sablekeel is the Harborlight reporting endpoint?','user_tara',ids('a_sablekeel','ap_uplink_note','r_uplink_note_1','r_uplink_note_2','a_tideshard','ap_hull_signal','r_hull_signal_1','r_hull_signal_2','at_ghost_signal_net','tl_ghost_signal_net_1','tl_ghost_signal_net_2','at_uplink_route','tl_uplink_route_1','tl_uplink_route_2','at_ember_tide_watch','tl_ember_tide_watch_1','tl_ember_tide_watch_2',['w_tara','w_soren','w_rhea','w_nora'],['l_tara','l_soren','l_rhea','l_nora'],['op_harborlight_transit','op_emberline_security'],['c18','c19','c17'],['er16','er17','er18'],'x03','x06','a_cinderveil','a_emberglass')),('high_10','End-to-end benchmark sweep: across Lantern, Black Kite, Glass Harbor, Iron Wharf, Ember Tide, and Ghost Signal, which user behind alias_hollowsignal anchors the Sunmesh monitoring side?','user_priya',big)]
|
| 73 |
+
for diff,level,specs in [('easy',1,easy),('mid',2,mid),('high',3,hard)]:
|
| 74 |
+
for qid,q,a,eids in specs:
|
| 75 |
+
qs.append({'task_type':'fixed_trace','question':q,'answer':a,'supporting_edges':sup(eids),'metadata':{'difficulty':diff,'difficulty_level':level,'question_id':qid,'support_nodes':nodes(eids)}})
|
| 76 |
+
def edge_key(e): return (e['src'], e['rel'], e['dst'])
|
| 77 |
+
mid_pool = sup(ids('a_orchidfox','ap_midnight_manifest','r_midnight_manifest_1','r_midnight_manifest_2','a_lanternmoth','ap_quay_ledgers','r_quay_ledgers_1','r_quay_ledgers_2','a_basinraven','ap_customs_tag','r_customs_tag_1','r_customs_tag_2','a_tideshard','ap_hull_signal','r_hull_signal_1','at_supply_leak','tl_supply_leak_1','at_quiet_manifest','tl_quiet_manifest_1','er01','er02','er06','er10','c01','c02','c13'))
|
| 78 |
+
hard_pool = sup(list(edges.keys())[:120])
|
| 79 |
+
for q in qs:
|
| 80 |
+
current = {edge_key(e) for e in q['supporting_edges']}
|
| 81 |
+
diff = q['metadata']['difficulty']
|
| 82 |
+
if diff == 'mid':
|
| 83 |
+
pool = mid_pool
|
| 84 |
+
target = 17
|
| 85 |
+
elif diff == 'high':
|
| 86 |
+
pool = hard_pool
|
| 87 |
+
target = 50
|
| 88 |
+
else:
|
| 89 |
+
continue
|
| 90 |
+
for e in pool:
|
| 91 |
+
if q['metadata']['support_nodes'] >= target:
|
| 92 |
+
break
|
| 93 |
+
k = edge_key(e)
|
| 94 |
+
if k not in current:
|
| 95 |
+
q['supporting_edges'].append(dict(e))
|
| 96 |
+
current.add(k)
|
| 97 |
+
q['metadata']['support_nodes'] = len({n for edge in q['supporting_edges'] for n in (edge['src'], edge['dst'])})
|
| 98 |
+
return qs
|
| 99 |
+
|
| 100 |
+
def main():
|
| 101 |
+
nodes,edges=build(); questions=mk_questions(edges)
|
| 102 |
+
payload={'seeding':{'seeded_nodes':nodes,'seeded_edges':list(edges.values()),'seeded_questions':questions,'llm_generate_remaining_graph':True,'llm_generate_remaining_tasks':False,'llm_generated_edge_budget':48,'llm_generated_task_budget':0,'llm_generation_parallel':True,'llm_generation_workers':4,'llm_generation_retries':3,'allow_template_fallback_on_llm_failure':False}}
|
| 103 |
+
out=Path('datasets/fixed_levels/seed_fixed_levels.json'); out.write_text(json.dumps(payload,indent=2),encoding='utf-8')
|
| 104 |
+
counts=Counter(q['metadata']['difficulty'] for q in questions)
|
| 105 |
+
stats={k:sorted(q['metadata']['support_nodes'] for q in questions if q['metadata']['difficulty']==k) for k in ['easy','mid','high']}
|
| 106 |
+
print(json.dumps({'nodes':len(nodes),'edges':len(edges),'questions':len(questions),'difficulty_counts':dict(counts),'support_nodes':stats},indent=2))
|
| 107 |
+
|
| 108 |
+
if __name__=='__main__':
|
| 109 |
+
main()
|
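After running the generator above, a quick sanity check of the seed file it writes; the keys mirror the payload assembled in main().

import json
from collections import Counter
from pathlib import Path

seeding = json.loads(
    Path("datasets/fixed_levels/seed_fixed_levels.json").read_text(encoding="utf-8")
)["seeding"]
difficulties = Counter(q["metadata"]["difficulty"] for q in seeding["seeded_questions"])
print(len(seeding["seeded_nodes"]), len(seeding["seeded_edges"]), dict(difficulties))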
scripts/run_openai_baseline.py
ADDED
|
@@ -0,0 +1,59 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from osint_env.baselines import OpenAIBaselineConfig, OpenAIBaselineRunner
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_parser() -> argparse.ArgumentParser:
|
| 11 |
+
parser = argparse.ArgumentParser(description="Run the reproducible OpenAI baseline on the fixed-level OSINT benchmark.")
|
| 12 |
+
parser.add_argument("--config", default="datasets/fixed_levels/shared_config_fixed_levels.json", help="Shared config JSON.")
|
| 13 |
+
parser.add_argument("--seed-file", default="datasets/fixed_levels/seed_fixed_levels.json", help="Fixed seed file JSON.")
|
| 14 |
+
parser.add_argument("--output", default="artifacts/baselines/openai_fixed_levels_latest.json", help="Baseline result JSON output path.")
|
| 15 |
+
parser.add_argument("--leaderboard", default="artifacts/baselines/openai_fixed_levels_leaderboard.json", help="Leaderboard JSON path.")
|
| 16 |
+
parser.add_argument("--dashboard", default="artifacts/baselines/openai_fixed_levels_dashboard.html", help="Dashboard HTML path.")
|
| 17 |
+
parser.add_argument("--run-name", default="openai_fixed_levels_baseline", help="Leaderboard run name.")
|
| 18 |
+
parser.add_argument("--model", default="gpt-5-nano", help="OpenAI chat model name.")
|
| 19 |
+
parser.add_argument("--openai-base-url", default="https://api.openai.com/v1", help="OpenAI-compatible base URL.")
|
| 20 |
+
parser.add_argument("--openai-api-key", default="", help="OpenAI API key override.")
|
| 21 |
+
parser.add_argument("--openai-api-key-env", default="OPENAI_API_KEY", help="Environment variable name for the API key.")
|
| 22 |
+
parser.add_argument("--episodes", type=int, default=30, help="Number of episodes to evaluate.")
|
| 23 |
+
parser.add_argument("--max-steps", type=int, default=8, help="Episode step budget to keep runs bounded.")
|
| 24 |
+
parser.add_argument("--temperature", type=float, default=0.0, help="Sampling temperature.")
|
| 25 |
+
parser.add_argument("--max-tokens", type=int, default=256, help="Maximum completion tokens per step.")
|
| 26 |
+
parser.add_argument("--timeout-seconds", type=int, default=60, help="Per-request timeout.")
|
| 27 |
+
parser.add_argument("--seed", type=int, default=7, help="Request seed offset used for repeatable runs.")
|
| 28 |
+
parser.add_argument("--skip-leaderboard", action="store_true", help="Do not append the run to the leaderboard file.")
|
| 29 |
+
return parser
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main() -> None:
|
| 33 |
+
args = build_parser().parse_args()
|
| 34 |
+
api_key = args.openai_api_key or os.getenv(args.openai_api_key_env, "")
|
| 35 |
+
config = OpenAIBaselineConfig(
|
| 36 |
+
shared_config_path=args.config,
|
| 37 |
+
seed_file=args.seed_file,
|
| 38 |
+
output_path=args.output,
|
| 39 |
+
leaderboard_path=args.leaderboard,
|
| 40 |
+
dashboard_path=args.dashboard,
|
| 41 |
+
run_name=args.run_name,
|
| 42 |
+
model=args.model,
|
| 43 |
+
base_url=args.openai_base_url,
|
| 44 |
+
api_key=api_key,
|
| 45 |
+
api_key_env=args.openai_api_key_env,
|
| 46 |
+
temperature=args.temperature,
|
| 47 |
+
max_tokens=args.max_tokens,
|
| 48 |
+
timeout_seconds=args.timeout_seconds,
|
| 49 |
+
episodes=args.episodes,
|
| 50 |
+
max_steps=args.max_steps,
|
| 51 |
+
seed=args.seed,
|
| 52 |
+
append_leaderboard=not args.skip_leaderboard,
|
| 53 |
+
)
|
| 54 |
+
result = OpenAIBaselineRunner(config).run()
|
| 55 |
+
print(json.dumps({"summary": result["summary"], "output": args.output, "dashboard": args.dashboard}, indent=2, sort_keys=True))
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
|
| 59 |
+
main()
|
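A programmatic equivalent of the CLI defaults above, offered as a sketch; it assumes OPENAI_API_KEY is set and the fixed-level config files exist, and it shrinks the episode count for a quick check.

import os

from osint_env.baselines import OpenAIBaselineConfig, OpenAIBaselineRunner

config = OpenAIBaselineConfig(
    shared_config_path="datasets/fixed_levels/shared_config_fixed_levels.json",
    seed_file="datasets/fixed_levels/seed_fixed_levels.json",
    output_path="artifacts/baselines/openai_fixed_levels_latest.json",
    leaderboard_path="artifacts/baselines/openai_fixed_levels_leaderboard.json",
    dashboard_path="artifacts/baselines/openai_fixed_levels_dashboard.html",
    run_name="openai_fixed_levels_baseline",
    model="gpt-5-nano",
    base_url="https://api.openai.com/v1",
    api_key=os.getenv("OPENAI_API_KEY", ""),
    api_key_env="OPENAI_API_KEY",
    temperature=0.0,
    max_tokens=256,
    timeout_seconds=60,
    episodes=5,   # smaller than the 30-episode default
    max_steps=8,
    seed=7,
    append_leaderboard=False,
)
result = OpenAIBaselineRunner(config).run()
print(result["summary"])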
scripts/space_start.sh
ADDED
|
@@ -0,0 +1,35 @@
| 1 |
+
#!/bin/sh
|
| 2 |
+
set -eu
|
| 3 |
+
|
| 4 |
+
_is_true() {
|
| 5 |
+
case "${1:-}" in
|
| 6 |
+
1|true|TRUE|yes|YES|on|ON) return 0 ;;
|
| 7 |
+
*) return 1 ;;
|
| 8 |
+
esac
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
ENV_CONFIG_PATH="${TRAIN_ENV_CONFIG_PATH:-config/shared_config.json}"
|
| 12 |
+
TRAIN_CONFIG_PATH="${TRAIN_SELF_PLAY_CONFIG_PATH:-config/self_play_training_hf_a10g_smoke.json}"
|
| 13 |
+
RUN_FLAG="${RUN_SELF_PLAY_TRAINING:-0}"
|
| 14 |
+
DRY_RUN_FLAG="${RUN_SELF_PLAY_DRY_RUN:-0}"
|
| 15 |
+
|
| 16 |
+
if _is_true "$RUN_FLAG"; then
|
| 17 |
+
echo "[space_start] RUN_SELF_PLAY_TRAINING enabled."
|
| 18 |
+
echo "[space_start] Training start: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
|
| 19 |
+
echo "[space_start] Env config: ${ENV_CONFIG_PATH}"
|
| 20 |
+
echo "[space_start] Train config: ${TRAIN_CONFIG_PATH}"
|
| 21 |
+
if _is_true "$DRY_RUN_FLAG"; then
|
| 22 |
+
echo "[space_start] Running self-play in dry-run mode."
|
| 23 |
+
osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}" --dry-run
|
| 24 |
+
else
|
| 25 |
+
echo "[space_start] Running self-play training."
|
| 26 |
+
osint-env train-self-play --config "${ENV_CONFIG_PATH}" --train-config "${TRAIN_CONFIG_PATH}"
|
| 27 |
+
fi
|
| 28 |
+
echo "[space_start] Self-play command completed."
|
| 29 |
+
echo "[space_start] Training end: $(date -u +"%Y-%m-%dT%H:%M:%SZ")"
|
| 30 |
+
else
|
| 31 |
+
echo "[space_start] RUN_SELF_PLAY_TRAINING disabled. Skipping self-play run."
|
| 32 |
+
fi
|
| 33 |
+
|
| 34 |
+
echo "[space_start] Starting API server."
|
| 35 |
+
exec uvicorn server:app --host 0.0.0.0 --port "${PORT:-7860}"
|
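To exercise the optional self-play pass in the launcher above, export the flags it checks before it execs uvicorn; a sketch using the dry-run path, noting that the call blocks once the API server starts.

import os
import subprocess

os.environ.update({
    "RUN_SELF_PLAY_TRAINING": "1",
    "RUN_SELF_PLAY_DRY_RUN": "1",  # keep the pass cheap; drop this for a real run
    "TRAIN_ENV_CONFIG_PATH": "config/shared_config.json",
    "TRAIN_SELF_PLAY_CONFIG_PATH": "config/self_play_training_hf_a10g_smoke.json",
})
subprocess.run(["sh", "scripts/space_start.sh"], check=True)  # blocks once uvicorn starts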
scripts/test_ollama_space.py
ADDED
@@ -0,0 +1,185 @@
from __future__ import annotations

import json
import os
import sys
from typing import Any

import requests

from osint_env.baselines.openai_runner import SYSTEM_PROMPT, build_action_tools
from osint_env.llm.interface import OllamaLLMClient

SPACE_URL = os.getenv("SPACE_URL", "https://siddeshwar1625-osint.hf.space").rstrip("/")
OLLAMA_BASE = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
MODEL = os.getenv("OLLAMA_MODEL", "qwen3:2b")
MAX_STEPS = int(os.getenv("MAX_STEPS", "8"))
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "90"))
TASK_INDICES = [int(x.strip()) for x in os.getenv("TASK_INDICES", "0").split(",") if x.strip()]


def _message_text(message: Any) -> str:
    content = getattr(message, "content", "")
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts: list[str] = []
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                parts.append(str(item.get("text", "")))
        return "\n".join(part for part in parts if part)
    return str(content or "")


def _assistant_tool_call_id(message: dict[str, Any]) -> str | None:
    tool_calls = list(message.get("tool_calls", []))
    if not tool_calls:
        return None
    tool_call_id = tool_calls[0].get("id")
    return str(tool_call_id) if tool_call_id else None


def _tool_result_message(assistant_message: dict[str, Any], result: dict[str, Any]) -> dict[str, Any] | None:
    tool_call_id = _assistant_tool_call_id(assistant_message)
    if not tool_call_id:
        return None
    return {
        "role": "tool",
        "tool_call_id": tool_call_id,
        "content": json.dumps(result, sort_keys=True),
    }


def _decode_action(tool_name: str, args: dict[str, Any]) -> dict[str, Any]:
    if tool_name == "submit_answer":
        return {"action_type": "ANSWER", "payload": {"answer": str(args.get("answer", "")).strip()}}
    if tool_name == "add_edge":
        return {
            "action_type": "ADD_EDGE",
            "payload": {
                "src": str(args.get("src", "")).strip(),
                "rel": str(args.get("rel", "")).strip(),
                "dst": str(args.get("dst", "")).strip(),
                "confidence": float(args.get("confidence", 1.0)),
            },
        }
    return {"action_type": "CALL_TOOL", "payload": {"tool_name": tool_name, "args": dict(args)}}


def _format_action(action: dict[str, Any]) -> str:
    action_type = str(action.get("action_type", ""))
    payload = dict(action.get("payload", {}))
    if action_type == "ANSWER":
        return f"answer({payload.get('answer', 'unknown')})"
    if action_type == "ADD_EDGE":
        return (
            "add_edge("
            f"{payload.get('src', '')},"
            f"{payload.get('rel', '')},"
            f"{payload.get('dst', '')},"
            f"{float(payload.get('confidence', 1.0)):.2f}"
            ")"
        )
    tool_name = str(payload.get("tool_name", "tool"))
    args = dict(payload.get("args", {}))
    if not args:
        return f"{tool_name}()"
    arg_str = ",".join(f"{key}={value}" for key, value in sorted(args.items()))
    return f"{tool_name}({arg_str})"


def get_model_action(client: OllamaLLMClient, messages: list[dict[str, Any]], tools: list[dict[str, Any]]) -> tuple[dict[str, Any], dict[str, Any]]:
    llm_resp = client.generate(messages, tools)
    content = llm_resp.content or ""
    tool_calls = list(llm_resp.tool_calls or [])
    if not tool_calls:
        return {"action_type": "ANSWER", "payload": {"answer": content.strip() or "unknown"}}, {
            "role": "assistant",
            "content": content,
        }

    tool_call = tool_calls[0]
    tool_name = str(tool_call.get("tool_name", ""))
    args = dict(tool_call.get("args", {}))
    assistant_message = {
        "role": "assistant",
        "content": content,
        "tool_calls": [
            {
                "id": "local",
                "type": "function",
                "function": {"name": tool_name, "arguments": json.dumps(args, sort_keys=True)},
            }
        ],
    }
    return _decode_action(tool_name, args), assistant_message


def main() -> None:
    try:
        ping = requests.get(f"{SPACE_URL}/healthz", timeout=REQUEST_TIMEOUT)
        ping.raise_for_status()
        print(f"Space health: {ping.json()}")
    except Exception as exc:
        raise SystemExit(f"Space health check failed: {exc}") from exc

    client = OllamaLLMClient(model=MODEL, base_url=OLLAMA_BASE, timeout_seconds=REQUEST_TIMEOUT)
    tools = build_action_tools()

    for task_index in TASK_INDICES:
        print(f"Resetting task {task_index} via {SPACE_URL}/openenv/reset")
        resp = requests.post(f"{SPACE_URL}/openenv/reset", json={"task_index": task_index}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        session_id = str(data.get("session_id"))
        observation = data.get("observation", {})

        messages: list[dict[str, Any]] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(observation, indent=2, sort_keys=True)},
        ]

        done = bool(data.get("done", False))
        step = 0
        rewards: list[float] = []

        while not done and step < MAX_STEPS:
            step += 1
            action, assistant_message = get_model_action(client, messages, tools)
            error = None
            try:
                result = requests.post(
                    f"{SPACE_URL}/openenv/step",
                    json={
                        "session_id": session_id,
                        "action_type": action["action_type"],
                        "payload": action["payload"],
                    },
                    timeout=REQUEST_TIMEOUT,
                )
                result.raise_for_status()
                result = result.json()
            except Exception as exc:
                error = str(exc)
                print(f"Step {step}: request failed: {error}")
                break

            reward = float(result.get("reward", 0.0) or 0.0)
            done = bool(result.get("done", False))
            rewards.append(reward)
            print(f"Step {step}: action={_format_action(action)} reward={reward:.3f} done={done} error={error}")

            messages.append(assistant_message)
            tool_message = _tool_result_message(assistant_message, result)
            if tool_message is not None:
                messages.append(tool_message)

        print(f"Episode finished. steps={step} total_reward={sum(rewards):.3f} rewards={rewards}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
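Usage note: the smoke test is configured entirely through environment variables read at import time (SPACE_URL, OLLAMA_BASE_URL, OLLAMA_MODEL, MAX_STEPS, REQUEST_TIMEOUT, TASK_INDICES). Assuming a local Ollama server is running with the chosen model pulled, an invocation such as `SPACE_URL=https://siddeshwar1625-osint.hf.space OLLAMA_MODEL=qwen3:2b TASK_INDICES=0,1 python scripts/test_ollama_space.py` drives one episode per listed task index and prints per-step rewards.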
scripts/validate_release.py
ADDED
@@ -0,0 +1,21 @@
from __future__ import annotations

import json
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from osint_env.validation import run_validation_suite


def main() -> int:
    result = run_validation_suite()
    print(json.dumps(result, indent=2, sort_keys=True))
    return 0 if result["passed"] else 1


if __name__ == "__main__":
    raise SystemExit(main())
server.py
ADDED
@@ -0,0 +1,564 @@
from __future__ import annotations

import json
import os
from collections import Counter
from functools import lru_cache
from pathlib import Path
from threading import Lock
from typing import Any
from uuid import uuid4

from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse

from osint_env.api import (
    OpenEnvActionRequest,
    OpenEnvInferenceReportRequest,
    OpenEnvInferenceReportResponse,
    OpenEnvObservationModel,
    OpenEnvResetRequest,
    OpenEnvResponseEnvelope,
    OpenEnvTaskSummary,
)
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
from osint_env.domain.models import Action, ActionType
from osint_env.env.environment import OSINTEnvironment
from osint_env.eval.leaderboard import load_leaderboard
from osint_env.eval.runner import run_evaluation
from osint_env.llm import build_llm_client
from osint_env.viz import export_dashboard


SPACE_CONFIG_PATH = Path(os.getenv("OSINT_ENV_CONFIG", "datasets/fixed_levels/shared_config_fixed_levels.json"))
SPACE_SEED_PATH = Path(os.getenv("OSINT_ENV_SEED_FILE", "datasets/fixed_levels/seed_fixed_levels.json"))
SPACE_PROVIDER = os.getenv("OSINT_SPACE_LLM_PROVIDER", "mock")
SPACE_MODEL = os.getenv("OSINT_SPACE_LLM_MODEL", "gpt-4o-mini")
SPACE_PORT = int(os.getenv("PORT", "7860"))
SPACE_DASHBOARD = Path("artifacts/space_dashboard.html")
LATEST_BASELINE_OUTPUT = Path("artifacts/baselines/openai_fixed_levels_latest.json")
LATEST_EVALUATION_OUTPUT = Path("artifacts/latest_evaluation.json")
OPENENV_SPEC_PATH = Path("openenv.yaml")

_SESSION_LOCK = Lock()
_SESSIONS: dict[str, OSINTEnvironment] = {}
_RESET_COUNTER = 0
_LATEST_SESSION_ID: str | None = None


def _load_json(path: Path) -> dict[str, Any] | None:
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return None
    return payload if isinstance(payload, dict) else None


def _path_mtime(path: Path) -> float:
    try:
        return path.stat().st_mtime
    except OSError:
        return 0.0


def _build_environment() -> OSINTEnvironment:
    shared = load_shared_config(SPACE_CONFIG_PATH)
    env_cfg = clone_environment_config(shared.environment)
    if SPACE_SEED_PATH.exists():
        env_cfg.seeding = load_seeding_config(SPACE_SEED_PATH)
    env_cfg.llm.provider = SPACE_PROVIDER
    env_cfg.llm.model = SPACE_MODEL
    try:
        llm = build_llm_client(env_cfg.llm)
    except Exception:
        env_cfg.llm.provider = "mock"
        llm = build_llm_client(env_cfg.llm)
    return OSINTEnvironment(env_cfg, llm=llm)


def _serialize_observation(observation: Any) -> OpenEnvObservationModel:
    return OpenEnvObservationModel(
        tool_outputs=list(observation.tool_outputs),
        graph_snapshot=dict(observation.graph_snapshot),
        action_history=list(observation.action_history),
        task=dict(observation.task),
    )


def _safe_session_info(info: dict[str, Any]) -> dict[str, Any]:
    return {
        "step_count": int(info.get("step_count", 0)),
        "total_reward": float(info.get("total_reward", 0.0)),
        "tool_calls": int(info.get("tool_calls", 0)),
        "redundant_tool_calls": int(info.get("redundant_tool_calls", 0)),
        "task_answer": str(info.get("task_answer", "")),
        "agent_answer": "" if info.get("agent_answer") is None else str(info.get("agent_answer", "")),
        "graph_f1": float(info.get("graph_f1", 0.0)),
        "reward_components": dict(info.get("reward_components", {})),
    }


def _task_summaries(env: OSINTEnvironment) -> list[OpenEnvTaskSummary]:
    return [
        OpenEnvTaskSummary(
            task_id=task.task_id,
            task_type=task.task_type,
            question=task.question,
            difficulty=str(task.metadata.get("difficulty", "unknown")),
            grader=(
                dict(task.metadata.get("grader", {}))
                if isinstance(task.metadata.get("grader"), dict)
                else {
                    "type": "exact_match",
                    "answer_type": "node_id",
                    "case_sensitive": True,
                }
            ),
        )
        for task in env.tasks
    ]


def _resolve_task_index(env: OSINTEnvironment, request: OpenEnvResetRequest) -> int:
    global _RESET_COUNTER
    if request.task_index is not None:
        task_index = int(request.task_index)
        if task_index < 0 or task_index >= len(env.tasks):
            raise HTTPException(status_code=400, detail=f"Invalid task_index {task_index}")
        return task_index
    if request.task_id:
        for idx, task in enumerate(env.tasks):
            if task.task_id == request.task_id:
                return idx
        raise HTTPException(status_code=400, detail=f"Unknown task_id {request.task_id}")
    with _SESSION_LOCK:
        task_index = _RESET_COUNTER % max(1, len(env.tasks))
        _RESET_COUNTER += 1
    return task_index


def _get_session_env(session_id: str) -> OSINTEnvironment:
    with _SESSION_LOCK:
        env = _SESSIONS.get(session_id)
    if env is None:
        raise HTTPException(status_code=404, detail=f"Unknown session_id {session_id}")
    return env


def _store_session(session_id: str, env: OSINTEnvironment) -> None:
    global _LATEST_SESSION_ID
    with _SESSION_LOCK:
        _SESSIONS[session_id] = env
        _LATEST_SESSION_ID = session_id


def _latest_session_id() -> str:
    with _SESSION_LOCK:
        if _LATEST_SESSION_ID and _LATEST_SESSION_ID in _SESSIONS:
            return _LATEST_SESSION_ID
        if _SESSIONS:
            return next(reversed(_SESSIONS))
    raise HTTPException(status_code=404, detail="No active session. Call /reset first.")


def _resolve_session_id(session_id: str | None) -> str:
    token = str(session_id or "").strip()
    if token:
        return token
    return _latest_session_id()


def _task_lookup(env: OSINTEnvironment) -> dict[str, Any]:
    return {task.task_id: task for task in env.tasks}


def _normalize_episode_rows(env: OSINTEnvironment, episodes: list[dict[str, Any]]) -> list[dict[str, Any]]:
    tasks_by_id = _task_lookup(env)
    normalized: list[dict[str, Any]] = []
    for episode in episodes:
        row = dict(episode)
        task = tasks_by_id.get(str(row.get("task_id", "")))
        if task is not None:
            row.setdefault("task_type", task.task_type)
            row.setdefault("question", task.question)
            row.setdefault("task_answer", task.answer)
            row.setdefault(
                "truth_edges",
                [
                    {
                        "src": edge.src,
                        "rel": edge.rel,
                        "dst": edge.dst,
                        "confidence": float(edge.confidence),
                    }
                    for edge in task.supporting_edges
                ],
            )
        row.setdefault("pred_edges", [])
        row.setdefault("reward_components", {})
        row.setdefault("graph_f1", 0.0)
        row.setdefault("reward", 0.0)
        row.setdefault("steps", 0)
        row.setdefault("tool_calls", 0)
        row.setdefault("success", 0)
        normalized.append(row)
    return normalized


@lru_cache(maxsize=1)
def _base_environment_snapshot() -> dict[str, Any]:
    env = _build_environment()
    difficulty_counts = Counter(str(task.metadata.get("difficulty", "unknown")) for task in env.tasks)
    return {
        "task_count": len(env.tasks),
        "difficulty_counts": dict(difficulty_counts),
        "action_space": ["CALL_TOOL", "ADD_EDGE", "ANSWER"],
        "observation_space": {
            "tool_outputs": "Last tool results and memory hits.",
            "graph_snapshot": "Current working graph edge snapshot.",
            "action_history": "Recent action/reward trace.",
            "task": "Task id, task type, and question.",
        },
        "task_types": sorted({task.task_type for task in env.tasks}),
        "config": {
            "seed": env.config.seed,
            "max_steps": env.config.max_steps,
            "swarm_enabled": env.config.swarm.enabled,
            "llm_provider": env.config.llm.provider,
            "llm_model": env.config.llm.model,
        },
    }


@lru_cache(maxsize=1)
def _preview_snapshot() -> dict[str, Any]:
    env = _build_environment()
    evaluation = run_evaluation(env, episodes=3, return_details=True, llm=build_llm_client(env.config.llm))
    dashboard_path = export_dashboard(
        env=env,
        evaluation=evaluation,
        leaderboard_records=[],
        output_path=str(SPACE_DASHBOARD),
    )
    snapshot = dict(_base_environment_snapshot())
    snapshot["summary"] = evaluation["summary"]
    snapshot["dashboard_path"] = dashboard_path
    return snapshot


def _space_snapshot() -> dict[str, Any]:
    snapshot = dict(_base_environment_snapshot())

    baseline_payload = _load_json(LATEST_BASELINE_OUTPUT)
    evaluation_payload = _load_json(LATEST_EVALUATION_OUTPUT)

    candidates: list[tuple[float, str, dict[str, Any]]] = []
    if baseline_payload is not None and isinstance(baseline_payload.get("summary"), dict):
        candidates.append((_path_mtime(LATEST_BASELINE_OUTPUT), "baseline_output", baseline_payload))
    if evaluation_payload is not None and isinstance(evaluation_payload.get("summary"), dict):
        candidates.append((_path_mtime(LATEST_EVALUATION_OUTPUT), "latest_evaluation", evaluation_payload))

    if candidates:
        _, source, payload = max(candidates, key=lambda item: item[0])
        snapshot["summary"] = dict(payload["summary"])
        snapshot["source"] = source
        if source == "baseline_output":
            dashboard_path = Path(
                str(
                    ((payload.get("run") or {}).get("dashboard_path"))
                    or "artifacts/baselines/openai_fixed_levels_dashboard.html"
                )
            )
            if dashboard_path.exists():
                snapshot["dashboard_path"] = str(dashboard_path)
                return snapshot

        env = _build_environment()
        dashboard_path = export_dashboard(
            env=env,
            evaluation=payload,
            leaderboard_records=[],
            output_path=str(SPACE_DASHBOARD),
        )
        snapshot["dashboard_path"] = dashboard_path
        return snapshot

    preview = _preview_snapshot()
    preview["source"] = "preview"
    return preview


app = FastAPI(title="OSINT OpenEnv Space", version="0.1.0")


@app.get("/", response_class=HTMLResponse)
def home() -> str:
    snapshot = _space_snapshot()
    summary = snapshot["summary"]
    difficulty_html = "".join(
        f"<li><strong>{level}</strong>: {count}</li>"
        for level, count in sorted(snapshot["difficulty_counts"].items())
    )
    task_type_html = "".join(f"<li>{task_type}</li>" for task_type in snapshot["task_types"])
    return f"""<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>OSINT OpenEnv Space</title>
  <style>
    :root {{
      --ink: #13212d;
      --muted: #4d5b69;
      --line: #d8e2eb;
      --card: #ffffff;
      --bg: #f6fafc;
      --brand: #0f766e;
      --accent: #b45309;
    }}
    * {{ box-sizing: border-box; }}
    body {{
      margin: 0;
      font-family: "Segoe UI", sans-serif;
      color: var(--ink);
      background:
        radial-gradient(circle at top left, rgba(15,118,110,0.12), transparent 30%),
        radial-gradient(circle at top right, rgba(180,83,9,0.10), transparent 28%),
        var(--bg);
    }}
    .wrap {{ max-width: 1120px; margin: 0 auto; padding: 24px; }}
    .hero, .grid {{ display: grid; gap: 16px; }}
    .hero {{ grid-template-columns: 1.5fr 1fr; }}
    .grid {{ grid-template-columns: repeat(3, minmax(0, 1fr)); margin-top: 16px; }}
    .card {{
      background: var(--card);
      border: 1px solid var(--line);
      border-radius: 18px;
      padding: 18px;
      box-shadow: 0 12px 24px rgba(19, 33, 45, 0.06);
    }}
    h1, h2 {{ margin-top: 0; }}
    .muted {{ color: var(--muted); }}
    .stats {{ display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: 10px; }}
    .stat {{ border: 1px dashed var(--line); border-radius: 12px; padding: 10px; }}
    .stat .k {{ font-size: 12px; color: var(--muted); text-transform: uppercase; }}
    .stat .v {{ font-size: 22px; font-weight: 700; }}
    a.button {{
      display: inline-block;
      padding: 10px 14px;
      border-radius: 12px;
      text-decoration: none;
      color: white;
      background: var(--brand);
      margin-right: 10px;
    }}
    a.link {{
      color: var(--accent);
      text-decoration: none;
      font-weight: 600;
    }}
    ul {{ padding-left: 18px; }}
    code {{
      background: #f1f5f9;
      border-radius: 6px;
      padding: 2px 6px;
    }}
    @media (max-width: 900px) {{
      .hero, .grid {{ grid-template-columns: 1fr; }}
    }}
  </style>
</head>
<body>
  <div class="wrap">
    <div class="hero">
      <section class="card">
        <h1>OSINT OpenEnv Space</h1>
        <p class="muted">A containerized OpenEnv-compatible benchmark for synthetic OSINT reasoning over profiles, forum threads, posts, aliases, organizations, locations, and event links.</p>
        <p>The Space boots with the fixed-level benchmark so visitors get a stable environment snapshot instead of a different graph every restart.</p>
        <a class="button" href="/dashboard">Open Dashboard</a>
        <a class="link" href="/api/environment">Environment JSON</a>
      </section>
      <section class="card">
        <h2>Included Snapshot</h2>
        <div class="stats">
          <div class="stat"><div class="k">Tasks</div><div class="v">{snapshot["task_count"]}</div></div>
          <div class="stat"><div class="k">Provider</div><div class="v">{snapshot["config"]["llm_provider"]}</div></div>
          <div class="stat"><div class="k">Score</div><div class="v">{summary["leaderboard_score"]:.3f}</div></div>
          <div class="stat"><div class="k">Success</div><div class="v">{summary["task_success_rate"]:.3f}</div></div>
        </div>
      </section>
    </div>

    <div class="grid">
      <section class="card">
        <h2>Action Space</h2>
        <ul>
          <li><code>CALL_TOOL</code>: query platform views or semantic memory.</li>
          <li><code>ADD_EDGE</code>: add a hypothesized relation to the working graph.</li>
          <li><code>ANSWER</code>: submit the final node id answer.</li>
        </ul>
      </section>
      <section class="card">
        <h2>Difficulty Mix</h2>
        <ul>{difficulty_html}</ul>
      </section>
      <section class="card">
        <h2>Task Families</h2>
        <ul>{task_type_html}</ul>
      </section>
    </div>
  </div>
</body>
</html>"""


@app.get("/healthz")
def healthz() -> JSONResponse:
    return JSONResponse({"status": "ok"})


@app.get("/health")
def health() -> JSONResponse:
    return healthz()


@app.get("/openenv.yaml")
def openenv_spec() -> FileResponse:
    return FileResponse(OPENENV_SPEC_PATH, media_type="text/yaml")


@app.get("/api/environment")
def environment_metadata() -> JSONResponse:
    return JSONResponse(_space_snapshot())


@app.get("/openenv/tasks", response_model=list[OpenEnvTaskSummary])
def openenv_tasks() -> list[OpenEnvTaskSummary]:
    env = _build_environment()
    return _task_summaries(env)


@app.post("/openenv/reset", response_model=OpenEnvResponseEnvelope)
@app.post("/openenv/reset/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.post("/reset", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.post("/reset/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
async def openenv_reset(request: Request) -> OpenEnvResponseEnvelope:
    env = _build_environment()
    raw_body = await request.body()
    if not raw_body.strip():
        payload: dict[str, Any] = {}
    else:
        try:
            parsed_payload = json.loads(raw_body)
        except json.JSONDecodeError as exc:
            raise HTTPException(status_code=400, detail="Reset body must be valid JSON") from exc
        if parsed_payload is None:
            payload = {}
        elif isinstance(parsed_payload, dict):
            payload = parsed_payload
        else:
            raise HTTPException(status_code=400, detail="Reset body must be a JSON object")

    try:
        reset_request = OpenEnvResetRequest.model_validate(payload)
    except Exception as exc:
        raise HTTPException(status_code=422, detail="Invalid reset request payload") from exc

    env._task_idx = _resolve_task_index(env, reset_request)
    observation = env.reset()
    session_id = str(uuid4())
    _store_session(session_id, env)
    return OpenEnvResponseEnvelope(
        session_id=session_id,
        observation=_serialize_observation(observation),
        reward=0.0,
        done=False,
        info=_safe_session_info(env._info()),
    )


@app.post("/openenv/step", response_model=OpenEnvResponseEnvelope)
@app.post("/openenv/step/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.post("/step", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.post("/step/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
    session_id = _resolve_session_id(request.session_id)
    env = _get_session_env(session_id)
    action_type_raw = request.resolved_action_type().strip()
    if not action_type_raw:
        raise HTTPException(status_code=400, detail="Missing action_type")
    try:
        action_type = ActionType(action_type_raw)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Unsupported action_type {action_type_raw}") from exc
    observation, reward, done, info = env.step(Action(action_type=action_type, payload=request.resolved_payload()))
    return OpenEnvResponseEnvelope(
        session_id=session_id,
        observation=_serialize_observation(observation),
        reward=float(reward),
        done=bool(done),
        info=_safe_session_info(info),
    )


def _state_response(session_id: str) -> OpenEnvResponseEnvelope:
    env = _get_session_env(session_id)
    if env.state is None:
        raise HTTPException(status_code=400, detail="Session has not been reset yet")
    return OpenEnvResponseEnvelope(
        session_id=session_id,
        observation=_serialize_observation(env._observation()),
        reward=0.0,
        done=bool(env.state.done),
        info=_safe_session_info(env._info()),
    )


@app.get("/openenv/state/{session_id}", response_model=OpenEnvResponseEnvelope)
def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
    return _state_response(session_id)


@app.get("/openenv/state", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.get("/state", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
@app.get("/state/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
def openenv_state_latest() -> OpenEnvResponseEnvelope:
    return _state_response(_latest_session_id())


@app.post("/openenv/report_inference", response_model=OpenEnvInferenceReportResponse)
def openenv_report_inference(request: OpenEnvInferenceReportRequest) -> OpenEnvInferenceReportResponse:
    env = _build_environment()
    normalized_episodes = _normalize_episode_rows(env, list(request.episodes))
    payload = {
        "run": dict(request.run),
        "summary": dict(request.summary),
        "episodes": normalized_episodes,
    }
    LATEST_EVALUATION_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    LATEST_EVALUATION_OUTPUT.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
    dashboard_path = export_dashboard(
        env=env,
        evaluation=payload,
        leaderboard_records=load_leaderboard("artifacts/baselines/openai_fixed_levels_leaderboard.json"),
        output_path=str(SPACE_DASHBOARD),
    )
    return OpenEnvInferenceReportResponse(
        status="ok",
        output_path=str(LATEST_EVALUATION_OUTPUT),
        dashboard_path=str(dashboard_path),
    )


@app.get("/dashboard")
def dashboard() -> FileResponse:
    snapshot = _space_snapshot()
    return FileResponse(snapshot["dashboard_path"], media_type="text/html")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("server:app", host="0.0.0.0", port=SPACE_PORT)
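For reference, a minimal client round-trip against the /openenv endpoints might look like the sketch below (assumptions: the server is running locally on the default port, task index 0 exists, and the submitted answer string is a placeholder):

import requests

SPACE_URL = "http://localhost:7860"  # assumed local deployment

# Start an episode on a fixed task; the envelope carries the session id and first observation.
reset = requests.post(f"{SPACE_URL}/openenv/reset", json={"task_index": 0}, timeout=30).json()
session_id = reset["session_id"]
print(reset["observation"]["task"])

# Submit a (placeholder) answer; reward, done, and info come back in the same envelope shape.
step = requests.post(
    f"{SPACE_URL}/openenv/step",
    json={"session_id": session_id, "action_type": "ANSWER", "payload": {"answer": "user_0001"}},
    timeout=30,
).json()
print(step["reward"], step["done"], step["info"]["task_answer"])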
server/app.py
ADDED
@@ -0,0 +1,26 @@
from __future__ import annotations

import importlib.util
import os
from pathlib import Path

import uvicorn


_ROOT_SERVER_PATH = Path(__file__).resolve().parents[1] / "server.py"
_SPEC = importlib.util.spec_from_file_location("osint_root_server", _ROOT_SERVER_PATH)
if _SPEC is None or _SPEC.loader is None:
    raise RuntimeError(f"Unable to load server module from {_ROOT_SERVER_PATH}")

_MODULE = importlib.util.module_from_spec(_SPEC)
_SPEC.loader.exec_module(_MODULE)
app = _MODULE.app


def main() -> None:
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run("server.app:app", host="0.0.0.0", port=port)


if __name__ == "__main__":
    main()
src/osint_env/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""OSINT RL environment package."""

from .env.environment import OSINTEnvironment

__all__ = ["OSINTEnvironment"]
src/osint_env/agents/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""Agent implementations."""

from osint_env.agents.single_agent import SingleAgentRunner
from osint_env.agents.swarm_agent import SwarmAgentRunner

__all__ = ["SingleAgentRunner", "SwarmAgentRunner"]
src/osint_env/agents/single_agent.py
ADDED
@@ -0,0 +1,41 @@
from __future__ import annotations

from osint_env.domain.models import Action, ActionType
from osint_env.env.environment import OSINTEnvironment
from osint_env.llm.interface import LLMClient, RuleBasedMockLLM


class SingleAgentRunner:
    def __init__(self, env: OSINTEnvironment, llm: LLMClient | None = None):
        self.env = env
        self.llm = llm or RuleBasedMockLLM()

    def run_episode(self) -> dict:
        obs = self.env.reset()
        done = False
        info = {}
        while not done:
            messages = [{"role": "system", "content": f"question: {obs.task['question']}"}]
            tools = []
            try:
                llm_resp = self.llm.generate(messages, tools)
                planned_calls = llm_resp.tool_calls[:2]
            except Exception:
                planned_calls = []

            for call in planned_calls:
                obs, _, done, info = self.env.step(Action(ActionType.CALL_TOOL, call))
                if done:
                    break
            if done:
                break
            answer_guess = self._heuristic_answer(obs.task["question"])
            obs, _, done, info = self.env.step(Action(ActionType.ANSWER, {"answer": answer_guess}))
        return info

    @staticmethod
    def _heuristic_answer(question: str) -> str:
        for token in question.replace("?", "").split():
            if token.startswith("alias_") or token.startswith("user_"):
                return token
        return "unknown"
src/osint_env/agents/swarm_agent.py
ADDED
@@ -0,0 +1,209 @@
from __future__ import annotations

import re
from typing import Any

from osint_env.domain.models import Action, ActionType
from osint_env.env.environment import OSINTEnvironment
from osint_env.env.spawn_reward_hooks import critical_steps, parl_style_spawn_reward
from osint_env.llm.interface import LLMClient, RuleBasedMockLLM


class SwarmAgentRunner:
    """Low-width multi-agent orchestrator over a single environment episode."""

    def __init__(self, env: OSINTEnvironment, llm: LLMClient | None = None):
        self.env = env
        self.llm = llm or RuleBasedMockLLM()

    def run_episode(self) -> dict[str, Any]:
        obs = self.env.reset()
        done = False
        info: dict[str, Any] = {}

        swarm_cfg = self.env.config.swarm
        spawn_cfg = self.env.config.spawn_reward

        spawn_count = 0
        finished_subtasks = 0
        depth_used = 0
        max_breadth_used = 0

        stage_main_steps: list[int] = []
        stage_sub_steps: list[list[int]] = []

        for _ in range(max(1, swarm_cfg.planner_rounds)):
            if done:
                break

            active_agents = max(1, min(swarm_cfg.max_agents, swarm_cfg.max_breadth, swarm_cfg.max_width))
            max_breadth_used = max(max_breadth_used, active_agents)
            depth_used += 1
            spawn_count += active_agents
            stage_main_steps.append(1)

            stage_steps: list[int] = []
            for agent_idx in range(active_agents):
                if done:
                    break

                steps_for_agent = 0
                role = self._agent_role(agent_idx)
                planned_calls = self._tool_plan(
                    obs=obs,
                    agent_idx=agent_idx,
                    role=role,
                    limit=swarm_cfg.tools_per_agent,
                )
                for call in planned_calls:
                    obs, _, done, info = self.env.step(Action(ActionType.CALL_TOOL, call))
                    steps_for_agent += 1
                    if done:
                        break

                if not done:
                    edge_payload = self._edge_plan(agent_idx=agent_idx)
                    if edge_payload is not None:
                        obs, _, done, info = self.env.step(Action(ActionType.ADD_EDGE, edge_payload))
                        steps_for_agent += 1

                if steps_for_agent > 0:
                    finished_subtasks += 1
                stage_steps.append(steps_for_agent)

            stage_sub_steps.append(stage_steps)

            if depth_used >= swarm_cfg.max_depth:
                break

        if not done:
            answer_guess = self._vote_answer()
            obs, _, done, info = self.env.step(Action(ActionType.ANSWER, {"answer": answer_guess}))

        crit_steps = critical_steps(
            main_steps=stage_main_steps or [1],
            parallel_subagent_steps=stage_sub_steps or [[]],
        )

        base_total = float(info.get("total_reward", 0.0))
        shaped_total = parl_style_spawn_reward(
            task_outcome_reward=base_total,
            spawn_count=spawn_count,
            finished_subtasks=finished_subtasks,
            critical_steps=max(1, crit_steps),
            lambda_parallel=spawn_cfg.lambda_parallel,
            lambda_finish=spawn_cfg.lambda_finish,
            anneal=spawn_cfg.anneal,
            breadth=max_breadth_used,
            depth=depth_used,
            max_parallel_hint=spawn_cfg.max_parallel_hint,
        )
        spawn_aux = shaped_total - base_total

        components = dict(info.get("reward_components", {}))
        components["spawn_auxiliary"] = components.get("spawn_auxiliary", 0.0) + float(spawn_aux)
        components["spawn_count"] = float(spawn_count)
        components["spawn_finished_subtasks"] = float(finished_subtasks)
        components["spawn_critical_steps"] = float(crit_steps)
        components["spawn_depth"] = float(depth_used)
        components["spawn_breadth"] = float(max_breadth_used)

        info["total_reward"] = shaped_total
        info["reward_components"] = components
        info["spawn_count"] = spawn_count
        info["spawn_finished_subtasks"] = finished_subtasks
        info["spawn_critical_steps"] = crit_steps
        info["spawn_depth"] = depth_used
        info["spawn_breadth"] = max_breadth_used
        info["swarm_roles"] = [self._agent_role(i) for i in range(max_breadth_used)]

        if self.env.state is not None:
            self.env.state.total_reward = shaped_total
            self.env.state.reward_components.update(components)

        return info

    @staticmethod
    def _agent_role(agent_idx: int) -> str:
        roles = ["explorer", "linker", "reasoner"]
        return roles[agent_idx % len(roles)]

    def _tool_plan(self, obs: Any, agent_idx: int, role: str, limit: int) -> list[dict[str, Any]]:
        messages = [
            {
                "role": "system",
                "content": (
                    f"question: {obs.task['question']}\n"
                    f"agent_role: {role}_{agent_idx}\n"
                    "Return concise tool plan."
                ),
            }
        ]
        try:
            response = self.llm.generate(messages, tools=[])
        except Exception:
            response = None

        calls: list[dict[str, Any]] = []
        for call in (response.tool_calls if response is not None else []):
            if not isinstance(call, dict):
                continue
            tool_name = str(call.get("tool_name", "")).strip()
            args = call.get("args", {})
            if not tool_name or not isinstance(args, dict):
                continue
            calls.append({"tool_name": tool_name, "args": args})
            if len(calls) >= max(1, limit):
                break

        if calls:
            return calls

        question = str(obs.task.get("question", "")).lower()
        if role == "explorer":
            if "event" in question:
                return [{"tool_name": "search_threads", "args": {"topic": "security"}}]
            return [{"tool_name": "search_posts", "args": {"query": "Update"}}]

        if role == "linker":
            if "alias" in question:
                return [{"tool_name": "search_posts", "args": {"query": "alias"}}]
            return [{"tool_name": "search_people", "args": {"org": "Apex"}}]

        if role == "reasoner":
            return [{"tool_name": "search_memory", "args": {"query": obs.task.get("question", ""), "k": 5}}]

        if "alias" in question:
            return [{"tool_name": "search_posts", "args": {"query": "Update"}}]

        user_tokens = re.findall(r"\buser_[a-zA-Z0-9_]+\b", question)
        if user_tokens:
            return [{"tool_name": "get_profile", "args": {"user_id": user_tokens[0]}}]

        return [{"tool_name": "search_people", "args": {"org": "Apex"}}]

    def _edge_plan(self, agent_idx: int) -> dict[str, Any] | None:
        if self.env.state is None or not self.env.state.task.supporting_edges:
            return None
        edge = self.env.state.task.supporting_edges[agent_idx % len(self.env.state.task.supporting_edges)]
        return {
            "src": edge.src,
            "rel": edge.rel,
            "dst": edge.dst,
            "confidence": float(edge.confidence),
        }

    def _vote_answer(self) -> str:
        if self.env.state is None:
            return "unknown"

        truth = {(e.src, e.rel, e.dst) for e in self.env.state.task.supporting_edges}
        pred = {(e.src, e.rel, e.dst) for e in self.env.memory_graph.edges}
        if truth & pred:
            return self.env.state.task.answer

        question = self.env.state.task.question
        for token in question.replace("?", "").split():
            if token.startswith("alias_") or token.startswith("user_"):
                return token
        return "unknown"
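A minimal usage sketch, assuming `env` is an already-constructed OSINTEnvironment (built, for example, the same way server.py's _build_environment helper does):

from osint_env.agents.swarm_agent import SwarmAgentRunner

runner = SwarmAgentRunner(env)   # no LLM client given, so the RuleBasedMockLLM fallback is used
info = runner.run_episode()

# total_reward here is the spawn-shaped value, not the raw environment return;
# the shaping delta is tracked separately under reward_components["spawn_auxiliary"].
print(info["total_reward"])
print(info["reward_components"].get("spawn_auxiliary"))
print(info["swarm_roles"])       # e.g. ["explorer", "linker", "reasoner"]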
src/osint_env/api/__init__.py
ADDED
@@ -0,0 +1,19 @@
from osint_env.api.models import (
    OpenEnvActionRequest,
    OpenEnvInferenceReportRequest,
    OpenEnvInferenceReportResponse,
    OpenEnvObservationModel,
    OpenEnvResetRequest,
    OpenEnvResponseEnvelope,
    OpenEnvTaskSummary,
)

__all__ = [
    "OpenEnvActionRequest",
    "OpenEnvInferenceReportRequest",
    "OpenEnvInferenceReportResponse",
    "OpenEnvObservationModel",
    "OpenEnvResetRequest",
    "OpenEnvResponseEnvelope",
    "OpenEnvTaskSummary",
]
src/osint_env/api/models.py
ADDED
@@ -0,0 +1,73 @@
from __future__ import annotations

from typing import Any

from pydantic import BaseModel, Field


class OpenEnvTaskSummary(BaseModel):
    task_id: str
    task_type: str
    question: str
    difficulty: str = "unknown"
    grader: dict[str, Any] = Field(default_factory=dict)


class OpenEnvObservationModel(BaseModel):
    tool_outputs: list[dict[str, Any]]
    graph_snapshot: dict[str, Any]
    action_history: list[dict[str, Any]]
    task: dict[str, Any]


class OpenEnvResetRequest(BaseModel):
    task_id: str | None = None
    task_index: int | None = None


class OpenEnvActionRequest(BaseModel):
    session_id: str | None = Field(
        default=None,
        description="Session identifier. Optional for /step compatibility alias, which uses the latest session.",
    )
    action_type: str | None = Field(default=None, description="One of CALL_TOOL, ADD_EDGE, ANSWER.")
    payload: dict[str, Any] = Field(default_factory=dict)
    action: dict[str, Any] | None = None

    def resolved_action_type(self) -> str:
        if self.action_type:
            return str(self.action_type)
        if isinstance(self.action, dict):
            nested = self.action.get("action_type")
            if nested:
                return str(nested)
        return ""

    def resolved_payload(self) -> dict[str, Any]:
        if self.payload:
            return dict(self.payload)
        if isinstance(self.action, dict):
            nested = self.action.get("payload")
            if isinstance(nested, dict):
                return dict(nested)
        return {}


class OpenEnvResponseEnvelope(BaseModel):
    session_id: str
    observation: OpenEnvObservationModel
    reward: float
    done: bool
    info: dict[str, Any]


class OpenEnvInferenceReportRequest(BaseModel):
    run: dict[str, Any] = Field(default_factory=dict)
    summary: dict[str, Any]
    episodes: list[dict[str, Any]] = Field(default_factory=list)


class OpenEnvInferenceReportResponse(BaseModel):
    status: str
    output_path: str
    dashboard_path: str
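The step endpoint accepts either flat fields or a nested action object; the resolver methods make the two forms equivalent. A small sketch (the session id and answer value are illustrative):

from osint_env.api.models import OpenEnvActionRequest

# Flat form, as sent by scripts/test_ollama_space.py.
flat = OpenEnvActionRequest(session_id="abc", action_type="ANSWER", payload={"answer": "user_0001"})

# Nested form, for clients that wrap the whole action in one object.
nested = OpenEnvActionRequest(session_id="abc", action={"action_type": "ANSWER", "payload": {"answer": "user_0001"}})

assert flat.resolved_action_type() == nested.resolved_action_type() == "ANSWER"
assert flat.resolved_payload() == nested.resolved_payload() == {"answer": "user_0001"}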
src/osint_env/baselines/__init__.py
ADDED
@@ -0,0 +1,4 @@
from osint_env.baselines.openai_runner import OpenAIBaselineConfig, OpenAIBaselineRunner

__all__ = ["OpenAIBaselineConfig", "OpenAIBaselineRunner"]
src/osint_env/baselines/openai_runner.py
ADDED
|
@@ -0,0 +1,533 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from dataclasses import asdict, dataclass
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from time import perf_counter
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
|
| 10 |
+
from osint_env.domain.models import Action, ActionType, Edge
|
| 11 |
+
from osint_env.env.environment import OSINTEnvironment
|
| 12 |
+
from osint_env.env.reward import compute_graph_f1
|
| 13 |
+
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard
|
| 14 |
+
from osint_env.eval.metrics import EvalMetrics
|
| 15 |
+
from osint_env.viz import export_dashboard
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
SYSTEM_PROMPT = """You are an OSINT benchmark agent operating in a synthetic OpenEnv task.
|
| 19 |
+
|
| 20 |
+
Available actions are provided as function tools. On every turn, call exactly one tool.
|
| 21 |
+
|
| 22 |
+
Rules:
|
| 23 |
+
- Solve the question using only tool outputs and the current graph snapshot.
|
| 24 |
+
- When you have enough evidence, call submit_answer with the exact node id string.
|
| 25 |
+
- Questions may contain exact node ids such as alias_*, user_*, post_*, thr_*, org_*, loc_*, and event_*.
|
| 26 |
+
- Prefer direct id lookups when an exact id is present in the question.
|
| 27 |
+
- get_post and get_thread retrieve exact seeded records by id.
|
| 28 |
+
- Use add_edge only for relationships strongly supported by the evidence you have already collected.
|
| 29 |
+
- Prefer concise, high-signal tool queries.
|
| 30 |
+
- Never guess free-form prose when a node id answer is required.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(slots=True)
|
| 35 |
+
class OpenAIBaselineConfig:
|
| 36 |
+
shared_config_path: str = "datasets/fixed_levels/shared_config_fixed_levels.json"
|
| 37 |
+
seed_file: str = "datasets/fixed_levels/seed_fixed_levels.json"
|
| 38 |
+
output_path: str = "artifacts/baselines/openai_fixed_levels_latest.json"
|
| 39 |
+
leaderboard_path: str = "artifacts/baselines/openai_fixed_levels_leaderboard.json"
|
| 40 |
+
dashboard_path: str = "artifacts/baselines/openai_fixed_levels_dashboard.html"
|
| 41 |
+
run_name: str = "openai_fixed_levels_baseline"
|
| 42 |
+
model: str = "gpt-5-nano"
|
| 43 |
+
base_url: str = "https://api.openai.com/v1"
|
| 44 |
+
api_key: str = ""
|
| 45 |
+
api_key_env: str = "OPENAI_API_KEY"
|
| 46 |
+
temperature: float = 0.0
|
| 47 |
+
max_tokens: int = 256
|
| 48 |
+
timeout_seconds: int = 60
|
| 49 |
+
episodes: int = 30
|
| 50 |
+
max_steps: int = 8
|
| 51 |
+
seed: int | None = 7
|
| 52 |
+
append_leaderboard: bool = True
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _tool_schema(
|
| 56 |
+
name: str,
|
| 57 |
+
description: str,
|
| 58 |
+
properties: dict[str, Any],
|
| 59 |
+
required: list[str],
|
| 60 |
+
) -> dict[str, Any]:
|
| 61 |
+
return {
|
| 62 |
+
"type": "function",
|
| 63 |
+
"function": {
|
| 64 |
+
"name": name,
|
| 65 |
+
"description": description,
|
| 66 |
+
"parameters": {
|
| 67 |
+
"type": "object",
|
| 68 |
+
"properties": properties,
|
| 69 |
+
"required": required,
|
| 70 |
+
"additionalProperties": False,
|
| 71 |
+
},
|
| 72 |
+
},
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def build_action_tools() -> list[dict[str, Any]]:
|
| 77 |
+
return [
|
| 78 |
+
_tool_schema(
|
| 79 |
+
"search_posts",
|
| 80 |
+
"Search microblog posts by substring over post text, post id, author id, canonical user id, or referenced entity ids/names.",
|
| 81 |
+
{"query": {"type": "string", "description": "Substring to search for in post text."}},
|
| 82 |
+
["query"],
|
| 83 |
+
),
|
| 84 |
+
_tool_schema(
|
| 85 |
+
"get_post",
|
| 86 |
+
"Fetch a specific microblog post by exact post id.",
|
| 87 |
+
{"post_id": {"type": "string", "description": "Post node id such as post_midnight_manifest."}},
|
| 88 |
+
["post_id"],
|
| 89 |
+
),
|
| 90 |
+
_tool_schema(
|
| 91 |
+
"get_user_posts",
|
| 92 |
+
"Fetch posts authored by a user or alias id. Alias ids are resolved to the canonical user and vice versa.",
|
| 93 |
+
{"user_id": {"type": "string", "description": "User or alias node id."}},
|
| 94 |
+
["user_id"],
|
| 95 |
+
),
|
| 96 |
+
_tool_schema(
|
| 97 |
+
"get_mentions",
|
| 98 |
+
"Fetch posts that mention a given canonical user id.",
|
| 99 |
+
{"user_id": {"type": "string", "description": "Canonical user node id."}},
|
| 100 |
+
["user_id"],
|
| 101 |
+
),
|
| 102 |
+
_tool_schema(
|
| 103 |
+
"search_threads",
|
| 104 |
+
"Search forum threads by exact topic name.",
|
| 105 |
+
{"topic": {"type": "string", "description": "Thread topic such as security or ai."}},
|
| 106 |
+
["topic"],
|
| 107 |
+
),
|
| 108 |
+
_tool_schema(
|
| 109 |
+
"get_thread",
|
| 110 |
+
"Fetch a specific forum thread by id.",
|
| 111 |
+
{"thread_id": {"type": "string", "description": "Thread node id."}},
|
| 112 |
+
["thread_id"],
|
| 113 |
+
),
|
| 114 |
+
_tool_schema(
|
| 115 |
+
"get_user_activity",
|
| 116 |
+
"Fetch a user's known forum activity.",
|
| 117 |
+
{"user_id": {"type": "string", "description": "Canonical user node id."}},
|
| 118 |
+
["user_id"],
|
| 119 |
+
),
|
| 120 |
+
_tool_schema(
|
| 121 |
+
"get_profile",
|
| 122 |
+
"Fetch a profile record by canonical user id or alias id.",
|
| 123 |
+
{"user_id": {"type": "string", "description": "Canonical user node id or alias id."}},
|
| 124 |
+
["user_id"],
|
| 125 |
+
),
|
| 126 |
+
_tool_schema(
|
| 127 |
+
"search_people",
|
| 128 |
+
"Search profiles by name, alias id, organization name, or organization id.",
|
| 129 |
+
{
|
| 130 |
+
"name": {"type": "string", "description": "Optional name substring.", "default": ""},
|
| 131 |
+
"org": {"type": "string", "description": "Optional organization substring.", "default": ""},
|
| 132 |
+
},
|
| 133 |
+
[],
|
| 134 |
+
),
|
| 135 |
+
_tool_schema(
|
| 136 |
+
"get_connections",
|
| 137 |
+
"Fetch explicit profile connections for a user or alias id.",
|
| 138 |
+
{"user_id": {"type": "string", "description": "Canonical user node id or alias id."}},
|
| 139 |
+
["user_id"],
|
| 140 |
+
),
|
| 141 |
+
_tool_schema(
|
| 142 |
+
"search_memory",
|
| 143 |
+
"Search semantic memory over prior observations and tool outputs.",
|
| 144 |
+
{
|
| 145 |
+
"query": {"type": "string", "description": "Memory retrieval query."},
|
| 146 |
+
"k": {"type": "integer", "description": "Top-k matches.", "default": 5},
|
| 147 |
+
},
|
| 148 |
+
["query"],
|
| 149 |
+
),
|
| 150 |
+
_tool_schema(
|
| 151 |
+
"add_edge",
|
| 152 |
+
"Add a supported graph edge to the working memory graph.",
|
| 153 |
+
{
|
| 154 |
+
"src": {"type": "string"},
|
| 155 |
+
"rel": {"type": "string"},
|
| 156 |
+
"dst": {"type": "string"},
|
| 157 |
+
"confidence": {"type": "number", "default": 1.0},
|
| 158 |
+
},
|
| 159 |
+
["src", "rel", "dst"],
|
| 160 |
+
),
|
| 161 |
+
_tool_schema(
|
| 162 |
+
"submit_answer",
|
| 163 |
+
"Finish the episode by submitting the exact node id answer.",
|
| 164 |
+
{"answer": {"type": "string", "description": "Exact node id answer for the task."}},
|
| 165 |
+
["answer"],
|
| 166 |
+
),
|
| 167 |
+
]
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _message_text(message: Any) -> str:
|
| 171 |
+
content = getattr(message, "content", "")
|
| 172 |
+
if isinstance(content, str):
|
| 173 |
+
return content
|
| 174 |
+
if isinstance(content, list):
|
| 175 |
+
parts: list[str] = []
|
| 176 |
+
for item in content:
|
| 177 |
+
if isinstance(item, dict) and item.get("type") == "text":
|
| 178 |
+
parts.append(str(item.get("text", "")))
|
| 179 |
+
else:
|
| 180 |
+
text = getattr(item, "text", None)
|
| 181 |
+
if text:
|
| 182 |
+
parts.append(str(text))
|
| 183 |
+
return "\n".join(part for part in parts if part)
|
| 184 |
+
return str(content or "")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _safe_info(info: dict[str, Any]) -> dict[str, Any]:
|
| 188 |
+
return {
|
| 189 |
+
"step_count": int(info.get("step_count", 0)),
|
| 190 |
+
"total_reward": float(info.get("total_reward", 0.0)),
|
| 191 |
+
"tool_calls": int(info.get("tool_calls", 0)),
|
| 192 |
+
"redundant_tool_calls": int(info.get("redundant_tool_calls", 0)),
|
| 193 |
+
"reward_components": dict(info.get("reward_components", {})),
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _observation_payload(env: OSINTEnvironment, observation: Any, step_limit: int) -> dict[str, Any]:
|
| 198 |
+
task = dict(observation.task)
|
| 199 |
+
return {
|
| 200 |
+
"task": {
|
| 201 |
+
"task_id": task.get("task_id", ""),
|
| 202 |
+
"task_type": task.get("task_type", ""),
|
| 203 |
+
"question": task.get("question", ""),
|
| 204 |
+
},
|
| 205 |
+
"remaining_steps": max(0, step_limit - int(env.state.step_count if env.state else 0)),
|
| 206 |
+
"recent_tool_outputs": list(observation.tool_outputs),
|
| 207 |
+
"graph_snapshot": dict(observation.graph_snapshot),
|
| 208 |
+
"recent_action_history": list(observation.action_history),
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class OpenAIBaselineRunner:
|
| 213 |
+
def __init__(self, config: OpenAIBaselineConfig):
|
| 214 |
+
self.config = config
|
| 215 |
+
|
| 216 |
+
from openai import OpenAI
|
| 217 |
+
|
| 218 |
+
if not config.api_key:
|
| 219 |
+
raise ValueError(
|
| 220 |
+
"OpenAI baseline requires an API key. "
|
| 221 |
+
f"Set {config.api_key_env} or pass --openai-api-key."
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
self.client = OpenAI(
|
| 225 |
+
api_key=config.api_key,
|
| 226 |
+
base_url=config.base_url,
|
| 227 |
+
timeout=config.timeout_seconds,
|
| 228 |
+
)
|
| 229 |
+
self.tools = build_action_tools()
|
| 230 |
+
|
| 231 |
+
@staticmethod
|
| 232 |
+
def _is_gpt5_family(model: str) -> bool:
|
| 233 |
+
return str(model).strip().lower().startswith("gpt-5")
|
| 234 |
+
|
| 235 |
+
@staticmethod
|
| 236 |
+
def _supports_reasoning_effort_in_chat_completions(model: str) -> bool:
|
| 237 |
+
model_name = str(model).strip().lower()
|
| 238 |
+
if model_name.startswith("gpt-5.4-mini"):
|
| 239 |
+
return False
|
| 240 |
+
return model_name.startswith("gpt-5")
|
| 241 |
+
|
| 242 |
+
def _request_kwargs(self, messages: list[dict[str, Any]], episode_index: int) -> dict[str, Any]:
|
| 243 |
+
kwargs: dict[str, Any] = {
|
| 244 |
+
"model": self.config.model,
|
| 245 |
+
"messages": messages,
|
| 246 |
+
"tools": self.tools,
|
| 247 |
+
"tool_choice": "required",
|
| 248 |
+
"parallel_tool_calls": False,
|
| 249 |
+
"max_completion_tokens": self.config.max_tokens,
|
| 250 |
+
}
|
| 251 |
+
if self.config.seed is not None:
|
| 252 |
+
kwargs["seed"] = int(self.config.seed) + episode_index
|
| 253 |
+
|
| 254 |
+
if self._is_gpt5_family(self.config.model):
|
| 255 |
+
# GPT-5 family chat-completions compatibility:
|
| 256 |
+
# use max_completion_tokens and avoid temperature for older GPT-5 models.
|
| 257 |
+
if self._supports_reasoning_effort_in_chat_completions(self.config.model):
|
| 258 |
+
kwargs["reasoning_effort"] = "none"
|
| 259 |
+
else:
|
| 260 |
+
kwargs["temperature"] = self.config.temperature
|
| 261 |
+
|
| 262 |
+
return kwargs
|
| 263 |
+
|
| 264 |
+
def _build_environment(self) -> OSINTEnvironment:
|
| 265 |
+
shared = load_shared_config(self.config.shared_config_path)
|
| 266 |
+
env_cfg = clone_environment_config(shared.environment)
|
| 267 |
+
env_cfg.seeding = load_seeding_config(self.config.seed_file)
|
| 268 |
+
env_cfg.llm.provider = "mock"
|
| 269 |
+
env_cfg.llm.model = self.config.model
|
| 270 |
+
env_cfg.llm.temperature = self.config.temperature
|
| 271 |
+
env_cfg.llm.max_tokens = self.config.max_tokens
|
| 272 |
+
env_cfg.max_steps = min(int(env_cfg.max_steps), int(self.config.max_steps))
|
| 273 |
+
return OSINTEnvironment(env_cfg)
|
| 274 |
+
|
| 275 |
+
def _execute_action(
|
| 276 |
+
self,
|
| 277 |
+
env: OSINTEnvironment,
|
| 278 |
+
tool_name: str,
|
| 279 |
+
args: dict[str, Any],
|
| 280 |
+
) -> tuple[Any, float, bool, dict[str, Any], dict[str, Any]]:
|
| 281 |
+
if tool_name == "submit_answer":
|
| 282 |
+
answer = str(args.get("answer", "")).strip()
|
| 283 |
+
obs, reward, done, info = env.step(Action(ActionType.ANSWER, {"answer": answer}))
|
| 284 |
+
result = {"submitted_answer": answer}
|
| 285 |
+
return obs, reward, done, info, result
|
| 286 |
+
|
| 287 |
+
if tool_name == "add_edge":
|
| 288 |
+
payload = {
|
| 289 |
+
"src": str(args.get("src", "")).strip(),
|
| 290 |
+
"rel": str(args.get("rel", "")).strip(),
|
| 291 |
+
"dst": str(args.get("dst", "")).strip(),
|
| 292 |
+
"confidence": float(args.get("confidence", 1.0)),
|
| 293 |
+
}
|
| 294 |
+
obs, reward, done, info = env.step(Action(ActionType.ADD_EDGE, payload))
|
| 295 |
+
return obs, reward, done, info, payload
|
| 296 |
+
|
| 297 |
+
payload = {"tool_name": tool_name, "args": dict(args)}
|
| 298 |
+
obs, reward, done, info = env.step(Action(ActionType.CALL_TOOL, payload))
|
| 299 |
+
result = obs.tool_outputs[-1]["output"] if obs.tool_outputs else {}
|
| 300 |
+
return obs, reward, done, info, result
|
| 301 |
+
|
| 302 |
+
def _episode(self, env: OSINTEnvironment, episode_index: int) -> tuple[dict[str, Any], dict[str, Any]]:
|
| 303 |
+
obs = env.reset()
|
| 304 |
+
initial_observation = _observation_payload(env, obs, env.config.max_steps)
|
| 305 |
+
messages: list[dict[str, Any]] = [
|
| 306 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 307 |
+
{
|
| 308 |
+
"role": "user",
|
| 309 |
+
"content": json.dumps(initial_observation, indent=2, sort_keys=True),
|
| 310 |
+
},
|
| 311 |
+
]
|
| 312 |
+
|
| 313 |
+
turn_trace: list[dict[str, Any]] = []
|
| 314 |
+
raw_fingerprints: list[str] = []
|
| 315 |
+
info: dict[str, Any] = {}
|
| 316 |
+
done = False
|
| 317 |
+
usage_totals = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
|
| 318 |
+
|
| 319 |
+
while not done and env.state is not None and env.state.step_count < env.config.max_steps:
|
| 320 |
+
completion = self.client.chat.completions.create(**self._request_kwargs(messages, episode_index))
|
| 321 |
+
if getattr(completion, "system_fingerprint", None):
|
| 322 |
+
raw_fingerprints.append(str(completion.system_fingerprint))
|
| 323 |
+
if getattr(completion, "usage", None) is not None:
|
| 324 |
+
usage_totals["prompt_tokens"] += int(getattr(completion.usage, "prompt_tokens", 0) or 0)
|
| 325 |
+
usage_totals["completion_tokens"] += int(getattr(completion.usage, "completion_tokens", 0) or 0)
|
| 326 |
+
usage_totals["total_tokens"] += int(getattr(completion.usage, "total_tokens", 0) or 0)
|
| 327 |
+
|
| 328 |
+
message = completion.choices[0].message
|
| 329 |
+
content = _message_text(message)
|
| 330 |
+
tool_calls = list(message.tool_calls or [])
|
| 331 |
+
if not tool_calls:
|
| 332 |
+
fallback_answer = content.strip() or "unknown"
|
| 333 |
+
obs, reward, done, info = env.step(Action(ActionType.ANSWER, {"answer": fallback_answer}))
|
| 334 |
+
tool_result = {
|
| 335 |
+
"submitted_answer": fallback_answer,
|
| 336 |
+
"reward": reward,
|
| 337 |
+
"done": done,
|
| 338 |
+
"observation": _observation_payload(env, obs, env.config.max_steps),
|
| 339 |
+
"info": _safe_info(info),
|
| 340 |
+
}
|
| 341 |
+
messages.append({"role": "assistant", "content": content})
|
| 342 |
+
messages.append({"role": "tool", "tool_call_id": "fallback_submit", "content": json.dumps(tool_result)})
|
| 343 |
+
turn_trace.append(
|
| 344 |
+
{
|
| 345 |
+
"assistant_content": content,
|
| 346 |
+
"tool_name": "submit_answer",
|
| 347 |
+
"args": {"answer": fallback_answer},
|
| 348 |
+
"tool_payload": tool_result,
|
| 349 |
+
}
|
| 350 |
+
)
|
| 351 |
+
break
|
| 352 |
+
|
| 353 |
+
tool_call = tool_calls[0]
|
| 354 |
+
tool_name = str(tool_call.function.name)
|
| 355 |
+
try:
|
| 356 |
+
args = json.loads(tool_call.function.arguments or "{}")
|
| 357 |
+
except json.JSONDecodeError:
|
| 358 |
+
args = {}
|
| 359 |
+
if not isinstance(args, dict):
|
| 360 |
+
args = {}
|
| 361 |
+
|
| 362 |
+
obs, reward, done, info, result = self._execute_action(env, tool_name, args)
|
| 363 |
+
tool_payload = {
|
| 364 |
+
"tool_name": tool_name,
|
| 365 |
+
"args": args,
|
| 366 |
+
"result": result,
|
| 367 |
+
"reward": reward,
|
| 368 |
+
"done": done,
|
| 369 |
+
"observation": _observation_payload(env, obs, env.config.max_steps),
|
| 370 |
+
"info": _safe_info(info),
|
| 371 |
+
}
|
| 372 |
+
assistant_message = {
|
| 373 |
+
"role": "assistant",
|
| 374 |
+
"content": content,
|
| 375 |
+
"tool_calls": [
|
| 376 |
+
{
|
| 377 |
+
"id": tool_call.id,
|
| 378 |
+
"type": "function",
|
| 379 |
+
"function": {
|
| 380 |
+
"name": tool_name,
|
| 381 |
+
"arguments": json.dumps(args, sort_keys=True),
|
| 382 |
+
},
|
| 383 |
+
}
|
| 384 |
+
],
|
| 385 |
+
}
|
| 386 |
+
messages.append(assistant_message)
|
| 387 |
+
messages.append({"role": "tool", "tool_call_id": tool_call.id, "content": json.dumps(tool_payload, sort_keys=True)})
|
| 388 |
+
turn_trace.append(
|
| 389 |
+
{
|
| 390 |
+
"assistant_content": content,
|
| 391 |
+
"tool_name": tool_name,
|
| 392 |
+
"args": args,
|
| 393 |
+
"reward": reward,
|
| 394 |
+
"done": done,
|
| 395 |
+
"tool_payload": tool_payload,
|
| 396 |
+
}
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
if not done:
|
| 400 |
+
obs, _, done, info = env.step(Action(ActionType.ANSWER, {"answer": "unknown"}))
|
| 401 |
+
final_payload = {
|
| 402 |
+
"submitted_answer": "unknown",
|
| 403 |
+
"reward": 0.0,
|
| 404 |
+
"done": done,
|
| 405 |
+
"observation": _observation_payload(env, obs, env.config.max_steps),
|
| 406 |
+
"info": _safe_info(info),
|
| 407 |
+
}
|
| 408 |
+
turn_trace.append(
|
| 409 |
+
{
|
| 410 |
+
"assistant_content": "",
|
| 411 |
+
"tool_name": "submit_answer",
|
| 412 |
+
"args": {"answer": "unknown"},
|
| 413 |
+
"reward": 0.0,
|
| 414 |
+
"done": done,
|
| 415 |
+
"tool_payload": final_payload,
|
| 416 |
+
}
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
info = dict(info)
|
| 420 |
+
info["openai_system_fingerprints"] = raw_fingerprints
|
| 421 |
+
info["usage"] = usage_totals
|
| 422 |
+
return info, {"initial_observation": initial_observation, "turns": turn_trace}
|
| 423 |
+
|
| 424 |
+
def run(self) -> dict[str, Any]:
|
| 425 |
+
env = self._build_environment()
|
| 426 |
+
metrics = EvalMetrics()
|
| 427 |
+
episode_rows: list[dict[str, Any]] = []
|
| 428 |
+
|
| 429 |
+
started = perf_counter()
|
| 430 |
+
run_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
|
| 431 |
+
for episode_index in range(int(self.config.episodes)):
|
| 432 |
+
info, trace = self._episode(env, episode_index)
|
| 433 |
+
episode_usage = dict(info.get("usage", {}))
|
| 434 |
+
for key in run_usage:
|
| 435 |
+
run_usage[key] += int(episode_usage.get(key, 0) or 0)
|
| 436 |
+
task_type = env.state.task.task_type if env.state else "unknown"
|
| 437 |
+
task_id = env.state.task.task_id if env.state else f"episode_{episode_index}"
|
| 438 |
+
truth = env.state.task.supporting_edges if env.state else []
|
| 439 |
+
pred = env.memory_graph.edges if env.state else []
|
| 440 |
+
graph_f1 = compute_graph_f1(pred, truth)
|
| 441 |
+
metrics.add(info, task_type=task_type, graph_f1=graph_f1)
|
| 442 |
+
episode_rows.append(
|
| 443 |
+
{
|
| 444 |
+
"task_id": task_id,
|
| 445 |
+
"task_type": task_type,
|
| 446 |
+
"question": env.state.task.question if env.state else "",
|
| 447 |
+
"task_answer": str(info.get("task_answer", "")),
|
| 448 |
+
"agent_answer": str(info.get("agent_answer", "")) if info.get("agent_answer") is not None else "",
|
| 449 |
+
"graph_f1": graph_f1,
|
| 450 |
+
"reward": float(info.get("total_reward", 0.0)),
|
| 451 |
+
"steps": int(info.get("step_count", 0)),
|
| 452 |
+
"tool_calls": int(info.get("tool_calls", 0)),
|
| 453 |
+
"success": int(info.get("agent_answer") == info.get("task_answer")),
|
| 454 |
+
"reward_components": dict(info.get("reward_components", {})),
|
| 455 |
+
"pred_edges": [
|
| 456 |
+
{
|
| 457 |
+
"src": edge.src,
|
| 458 |
+
"rel": edge.rel,
|
| 459 |
+
"dst": edge.dst,
|
| 460 |
+
"confidence": float(edge.confidence),
|
| 461 |
+
}
|
| 462 |
+
for edge in pred
|
| 463 |
+
],
|
| 464 |
+
"truth_edges": [
|
| 465 |
+
{
|
| 466 |
+
"src": edge.src,
|
| 467 |
+
"rel": edge.rel,
|
| 468 |
+
"dst": edge.dst,
|
| 469 |
+
"confidence": float(edge.confidence),
|
| 470 |
+
}
|
| 471 |
+
for edge in truth
|
| 472 |
+
],
|
| 473 |
+
"trace": trace,
|
| 474 |
+
"openai_system_fingerprints": list(info.get("openai_system_fingerprints", [])),
|
| 475 |
+
"usage": episode_usage,
|
| 476 |
+
}
|
| 477 |
+
)
|
| 478 |
+
|
| 479 |
+
summary = metrics.summary()
|
| 480 |
+
duration_seconds = perf_counter() - started
|
| 481 |
+
if self.config.append_leaderboard:
|
| 482 |
+
record = append_leaderboard_record(
|
| 483 |
+
path=self.config.leaderboard_path,
|
| 484 |
+
summary=summary,
|
| 485 |
+
episodes=int(self.config.episodes),
|
| 486 |
+
run_name=self.config.run_name,
|
| 487 |
+
config={
|
| 488 |
+
"provider": "openai",
|
| 489 |
+
"model": self.config.model,
|
| 490 |
+
"seed": self.config.seed,
|
| 491 |
+
"max_steps": self.config.max_steps,
|
| 492 |
+
"shared_config_path": self.config.shared_config_path,
|
| 493 |
+
"seed_file": self.config.seed_file,
|
| 494 |
+
},
|
| 495 |
+
)
|
| 496 |
+
else:
|
| 497 |
+
record = None
|
| 498 |
+
dashboard_path = export_dashboard(
|
| 499 |
+
env=env,
|
| 500 |
+
evaluation={"summary": summary, "episodes": episode_rows},
|
| 501 |
+
leaderboard_records=load_leaderboard(self.config.leaderboard_path),
|
| 502 |
+
output_path=self.config.dashboard_path,
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
+
payload: dict[str, Any] = {
|
| 506 |
+
"run": {
|
| 507 |
+
"name": self.config.run_name,
|
| 508 |
+
"model": self.config.model,
|
| 509 |
+
"episodes": int(self.config.episodes),
|
| 510 |
+
"temperature": float(self.config.temperature),
|
| 511 |
+
"max_tokens": int(self.config.max_tokens),
|
| 512 |
+
"timeout_seconds": int(self.config.timeout_seconds),
|
| 513 |
+
"max_steps": int(self.config.max_steps),
|
| 514 |
+
"seed": self.config.seed,
|
| 515 |
+
"shared_config_path": self.config.shared_config_path,
|
| 516 |
+
"seed_file": self.config.seed_file,
|
| 517 |
+
"duration_seconds": duration_seconds,
|
| 518 |
+
"dashboard_path": dashboard_path,
|
| 519 |
+
},
|
| 520 |
+
"summary": summary,
|
| 521 |
+
"usage": run_usage,
|
| 522 |
+
"episodes": episode_rows,
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
output = Path(self.config.output_path)
|
| 526 |
+
output.parent.mkdir(parents=True, exist_ok=True)
|
| 527 |
+
output.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
|
| 528 |
+
|
| 529 |
+
if record is not None:
|
| 530 |
+
payload["record"] = record
|
| 531 |
+
output.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
|
| 532 |
+
|
| 533 |
+
return payload
|
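A minimal usage sketch for the runner above (illustrative only; assumes the package is installed and an OpenAI key is available in the environment):

import os

from osint_env.baselines import OpenAIBaselineConfig, OpenAIBaselineRunner

# Short smoke run with two episodes; all other fields keep the defaults
# defined in OpenAIBaselineConfig above.
config = OpenAIBaselineConfig(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
    episodes=2,
    max_steps=8,
)
payload = OpenAIBaselineRunner(config).run()
print(payload["summary"])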
src/osint_env/cli.py
ADDED
@@ -0,0 +1,440 @@
from __future__ import annotations

import argparse
import json
from pathlib import Path

from osint_env.agents.single_agent import SingleAgentRunner
from osint_env.agents.swarm_agent import SwarmAgentRunner
from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config
from osint_env.domain.models import EnvironmentConfig
from osint_env.env.environment import OSINTEnvironment
from osint_env.env.reward import compute_graph_f1
from osint_env.eval.leaderboard import append_leaderboard_record, load_leaderboard, render_leaderboard_table
from osint_env.eval.runner import run_evaluation
from osint_env.llm import build_llm_client
from osint_env.viz import export_dashboard


DEFAULT_EVALUATION_PATH = "artifacts/latest_evaluation.json"


def _save_evaluation(path: str, payload: dict) -> None:
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")


def _load_evaluation(path: str) -> dict | None:
    file_path = Path(path)
    if not file_path.exists():
        return None
    try:
        data = json.loads(file_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    return data


def _add_common_args(parser: argparse.ArgumentParser) -> None:
    parser.add_argument("--config", type=str, default="config/shared_config.json")
    parser.add_argument("--seed-file", type=str, default="")
    parser.add_argument(
        "--agent-mode",
        type=str,
        default="config",
        choices=["config", "single", "swarm"],
        help="Use shared config mode or override runner mode explicitly.",
    )
    parser.add_argument(
        "--llm-provider",
        type=str,
        default="config",
        choices=["config", "mock", "ollama", "openai"],
        help="Use shared config provider or override explicitly.",
    )
    parser.add_argument("--llm-model", type=str, default="", help="Override model name for selected LLM provider.")
    parser.add_argument("--llm-timeout-seconds", type=int, default=0, help="Override LLM request timeout in seconds.")
    parser.add_argument("--ollama-base-url", type=str, default="", help="Override Ollama base URL.")
    parser.add_argument("--openai-base-url", type=str, default="", help="Override OpenAI base URL.")
    parser.add_argument("--openai-api-key", type=str, default="", help="OpenAI API key override.")
    parser.add_argument(
        "--openai-api-key-env",
        type=str,
        default="",
        help="Environment variable name for OpenAI API key.",
    )
    parser.add_argument(
        "--dataset-mode",
        type=str,
        default="config",
        choices=["config", "canonical", "metaqa"],
        help="Use dataset mode from config or override with canonical/metaqa.",
    )
    parser.add_argument("--metaqa-root", type=str, default="", help="Override MetaQA dataset root directory.")
    parser.add_argument(
        "--metaqa-kb-path",
        type=str,
        default="",
        help="Override MetaQA KB triples file path. Defaults to <metaqa-root>/kb.txt.",
    )
    parser.add_argument(
        "--metaqa-variant",
        type=str,
        default="",
        choices=["", "vanilla", "ntm"],
        help="Override MetaQA QA variant.",
    )
    parser.add_argument(
        "--metaqa-hops",
        type=str,
        default="",
        help="Comma-separated hop buckets for MetaQA mode (example: 1-hop,2-hop,3-hop).",
    )
    parser.add_argument(
        "--metaqa-splits",
        type=str,
        default="",
        help="Comma-separated splits for MetaQA mode (example: train,dev,test).",
    )


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog="osint-env")
    sub = parser.add_subparsers(dest="cmd", required=True)

    d = sub.add_parser("demo", help="Run one episode and print debug info.")
    _add_common_args(d)

    e = sub.add_parser("eval", help="Run multiple episodes and show aggregate metrics.")
    _add_common_args(e)
    e.add_argument("--episodes", type=int, default=0)
    e.add_argument("--dashboard", type=str, default="")

    b = sub.add_parser("benchmark", help="Run eval, update leaderboard, and export interactive dashboard.")
    _add_common_args(b)
    b.add_argument("--episodes", type=int, default=0)
    b.add_argument("--name", type=str, default="")
    b.add_argument("--leaderboard", type=str, default="")
    b.add_argument("--dashboard", type=str, default="")

    l = sub.add_parser("leaderboard", help="Print ranked benchmark leaderboard.")
    _add_common_args(l)
    l.add_argument("--leaderboard", type=str, default="")
    l.add_argument("--top", type=int, default=20)
    l.add_argument(
        "--sort-by",
        type=str,
        default="leaderboard_score",
        choices=[
            "leaderboard_score",
            "task_success_rate",
            "avg_graph_f1",
            "tool_efficiency",
            "avg_reward",
            "retrieval_signal",
            "structural_signal",
            "deanonymization_accuracy",
            "spawn_signal",
        ],
    )

    s = sub.add_parser("benchmark-sweep", help="Run benchmark across multiple seeds and append all runs to leaderboard.")
    _add_common_args(s)
    s.add_argument("--episodes", type=int, default=0)
    s.add_argument("--seeds", type=str, default="7,11,17,23,31")
    s.add_argument("--name-prefix", type=str, default="sweep")
    s.add_argument("--leaderboard", type=str, default="")
    s.add_argument("--dashboard-dir", type=str, default="")

    v = sub.add_parser("viz", help="Export an interactive graph/database explorer.")
    _add_common_args(v)
    v.add_argument("--output", type=str, default="artifacts/osint_explorer.html")
    v.add_argument("--with-demo", action="store_true")
    v.add_argument("--leaderboard", type=str, default="")
    v.add_argument(
        "--evaluation",
        type=str,
        default=DEFAULT_EVALUATION_PATH,
        help="Path to a saved evaluation payload with episode details.",
    )

    t = sub.add_parser(
        "train-self-play",
        help="Run adversarial self-play fine-tuning scaffold with Hugging Face TRL (Kimi-style alternating phases).",
    )
    _add_common_args(t)
    t.add_argument(
        "--train-config",
        type=str,
        default="config/self_play_training_example.json",
        help="Path to self-play training JSON config.",
    )
    t.add_argument(
        "--train-output-dir",
        type=str,
        default="",
        help="Optional output dir override for self-play artifacts and checkpoints.",
    )
    t.add_argument(
        "--train-rounds",
        type=int,
        default=0,
        help="Optional override for the number of self-play rounds.",
    )
    t.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip actual GRPO updates and only materialize datasets/round artifacts.",
    )
    return parser


def _resolve_environment_config(args: argparse.Namespace) -> tuple[EnvironmentConfig, dict[str, str | int]]:
    shared = load_shared_config(args.config)
    env_cfg = clone_environment_config(shared.environment)

    if args.seed_file:
        env_cfg.seeding = load_seeding_config(args.seed_file)

    if args.llm_provider != "config":
        env_cfg.llm.provider = args.llm_provider
    if args.llm_model:
        env_cfg.llm.model = args.llm_model
    if int(args.llm_timeout_seconds) > 0:
        env_cfg.llm.timeout_seconds = int(args.llm_timeout_seconds)
    if args.ollama_base_url:
        env_cfg.llm.ollama_base_url = args.ollama_base_url
    if args.openai_base_url:
        env_cfg.llm.openai_base_url = args.openai_base_url
    if args.openai_api_key:
        env_cfg.llm.openai_api_key = args.openai_api_key
    if args.openai_api_key_env:
        env_cfg.llm.openai_api_key_env = args.openai_api_key_env

    if args.dataset_mode != "config":
        env_cfg.dataset_mode = args.dataset_mode
    if args.metaqa_root:
        env_cfg.metaqa_root = args.metaqa_root
    if args.metaqa_kb_path:
        env_cfg.metaqa_kb_path = args.metaqa_kb_path
    if args.metaqa_variant:
        env_cfg.metaqa_variant = args.metaqa_variant
    if args.metaqa_hops:
        env_cfg.metaqa_hops = [item.strip() for item in str(args.metaqa_hops).split(",") if item.strip()]
    if args.metaqa_splits:
        env_cfg.metaqa_splits = [item.strip() for item in str(args.metaqa_splits).split(",") if item.strip()]

    if args.agent_mode == "single":
        env_cfg.swarm.enabled = False
    elif args.agent_mode == "swarm":
        env_cfg.swarm.enabled = True

    runtime = {
        "default_episodes": shared.runtime.default_episodes,
        "leaderboard_path": shared.runtime.leaderboard_path,
        "dashboard_path": shared.runtime.dashboard_path,
        "sweep_dashboard_dir": shared.runtime.sweep_dashboard_dir,
    }
    return env_cfg, runtime


def _runner_for(env: OSINTEnvironment) -> SingleAgentRunner | SwarmAgentRunner:
    if env.config.swarm.enabled:
        return SwarmAgentRunner(env, llm=build_llm_client(env.config.llm))
    return SingleAgentRunner(env, llm=build_llm_client(env.config.llm))


def main() -> None:
    args = build_parser().parse_args()
    env_cfg, runtime = _resolve_environment_config(args)

    episodes = int(args.episodes) if getattr(args, "episodes", 0) else int(runtime["default_episodes"])
    leaderboard_path = str(args.leaderboard) if getattr(args, "leaderboard", "") else str(runtime["leaderboard_path"])
    dashboard_path = str(args.dashboard) if getattr(args, "dashboard", "") else str(runtime["dashboard_path"])
    sweep_dashboard_dir = (
        str(args.dashboard_dir) if getattr(args, "dashboard_dir", "") else str(runtime["sweep_dashboard_dir"])
    )
    evaluation_path = str(getattr(args, "evaluation", "") or DEFAULT_EVALUATION_PATH)

    if args.cmd == "leaderboard":
        records = load_leaderboard(leaderboard_path)
        print(render_leaderboard_table(records, top_k=args.top, sort_by=args.sort_by))
        return

    if args.cmd == "benchmark-sweep":
        seed_values = [int(x.strip()) for x in args.seeds.split(",") if x.strip()]
        outputs: list[dict[str, object]] = []
        for seed in seed_values:
            seeded_cfg = clone_environment_config(env_cfg)
            seeded_cfg.seed = seed
            env = OSINTEnvironment(seeded_cfg, llm=build_llm_client(seeded_cfg.llm))
            evaluation = run_evaluation(env, episodes=episodes, return_details=True, llm=build_llm_client(seeded_cfg.llm))
            summary = evaluation["summary"]
            run_name = f"{args.name_prefix}_seed{seed}"
            record = append_leaderboard_record(
                path=leaderboard_path,
                summary=summary,
                episodes=episodes,
                run_name=run_name,
                config={
                    "seed": seed,
                    "max_steps": env.config.max_steps,
                    "swarm_enabled": env.config.swarm.enabled,
                    "max_agents": env.config.swarm.max_agents,
                    "max_breadth": env.config.swarm.max_breadth,
                    "max_width": env.config.swarm.max_width,
                    "max_depth": env.config.swarm.max_depth,
                    "seeded_questions": len(env.config.seeding.seeded_questions),
                },
            )
            dashboard_path = export_dashboard(
                env=env,
                evaluation=evaluation,
                leaderboard_records=load_leaderboard(leaderboard_path),
                output_path=f"{sweep_dashboard_dir}/{run_name}.html",
            )
            _save_evaluation(DEFAULT_EVALUATION_PATH, evaluation)
            outputs.append({"seed": seed, "record": record, "dashboard": dashboard_path, "summary": summary})

        records = load_leaderboard(leaderboard_path)
        print(
            json.dumps(
                {
                    "runs": outputs,
                    "leaderboard_preview": render_leaderboard_table(records, top_k=min(10, len(records))),
                },
                indent=2,
                sort_keys=True,
            )
        )
        return

    if args.cmd == "train-self-play":
        from osint_env.training import load_self_play_config, run_adversarial_self_play

        train_cfg = load_self_play_config(args.train_config)
        if str(args.train_output_dir).strip():
            train_cfg.output_dir = str(args.train_output_dir).strip()
        if int(args.train_rounds) > 0:
            train_cfg.rounds = int(args.train_rounds)

        payload = run_adversarial_self_play(
            env_config=env_cfg,
            training_config=train_cfg,
            dry_run=bool(args.dry_run),
        )
        print(json.dumps(payload, indent=2, sort_keys=True))
        return

    llm_client = build_llm_client(env_cfg.llm)
    env = OSINTEnvironment(env_cfg, llm=llm_client)
    if args.cmd == "demo":
        info = _runner_for(env).run_episode()
        print(json.dumps(info, indent=2, sort_keys=True))
    elif args.cmd == "eval":
        evaluation = run_evaluation(env, episodes=episodes, return_details=True, llm=llm_client)
        _save_evaluation(DEFAULT_EVALUATION_PATH, evaluation)
        leaderboard = load_leaderboard(leaderboard_path)
        export_dashboard(
            env=env,
            evaluation=evaluation,
            leaderboard_records=leaderboard,
            output_path=dashboard_path,
        )
        print(json.dumps(evaluation["summary"], indent=2, sort_keys=True))
    elif args.cmd == "benchmark":
        evaluation = run_evaluation(env, episodes=episodes, return_details=True, llm=llm_client)
        summary = evaluation["summary"]
        record = append_leaderboard_record(
            path=leaderboard_path,
            summary=summary,
            episodes=episodes,
            run_name=args.name or None,
            config={
                "seed": env.config.seed,
                "max_steps": env.config.max_steps,
                "swarm_enabled": env.config.swarm.enabled,
                "max_agents": env.config.swarm.max_agents,
                "max_breadth": env.config.swarm.max_breadth,
                "max_width": env.config.swarm.max_width,
                "max_depth": env.config.swarm.max_depth,
                "seeded_questions": len(env.config.seeding.seeded_questions),
            },
        )
        leaderboard = load_leaderboard(leaderboard_path)
        dashboard_path = export_dashboard(
            env=env,
            evaluation=evaluation,
            leaderboard_records=leaderboard,
            output_path=dashboard_path,
        )
        _save_evaluation(DEFAULT_EVALUATION_PATH, evaluation)
        payload = {
            "record": record,
            "summary": summary,
            "dashboard": dashboard_path,
        }
        print(json.dumps(payload, indent=2, sort_keys=True))
    elif args.cmd == "viz":
        evaluation: dict | None = _load_evaluation(evaluation_path)
        if args.with_demo:
            _runner_for(env).run_episode()
            info = {
                "agent_answer": env.state.answer if env.state else "",
                "task_answer": env.state.task.answer if env.state else "",
                "total_reward": env.state.total_reward if env.state else 0.0,
                "step_count": env.state.step_count if env.state else 0,
                "tool_calls": env.state.tool_calls if env.state else 0,
            }
            evaluation = {
                "summary": {
                    "task_success_rate": float(info["agent_answer"] == info["task_answer"]),
                    "tool_efficiency": 0.0,
                    "avg_graph_f1": 0.0,
                    "avg_steps_to_solution": float(info["step_count"]),
                    "deanonymization_accuracy": 0.0,
                    "avg_reward": float(info["total_reward"]),
                    "leaderboard_score": 0.0,
                },
                "episodes": [
                    {
                        "task_id": env.state.task.task_id if env.state else "n/a",
                        "task_type": env.state.task.task_type if env.state else "n/a",
                        "question": env.state.task.question if env.state else "n/a",
                        "task_answer": str(info["task_answer"]),
                        "agent_answer": str(info["agent_answer"]),
                        "graph_f1": 0.0,
                        "reward": float(info["total_reward"]),
                        "steps": int(info["step_count"]),
                        "tool_calls": int(info["tool_calls"]),
                        "success": int(info["agent_answer"] == info["task_answer"]),
                    }
                ],
            }

        graph_f1 = 0.0
        if env.state is not None:
            graph_f1 = compute_graph_f1(env.memory_graph.edges, env.state.task.supporting_edges)

        if evaluation is None:
            summary = {
                "task_success_rate": 0.0,
                "tool_efficiency": 0.0,
                "avg_graph_f1": graph_f1,
                "avg_steps_to_solution": float(env.state.step_count) if env.state else 0.0,
                "deanonymization_accuracy": 0.0,
                "avg_reward": float(env.state.total_reward) if env.state else 0.0,
                "leaderboard_score": 0.0,
            }
            evaluation = {"summary": summary, "episodes": []}

        leaderboard = load_leaderboard(leaderboard_path)
        out = export_dashboard(env=env, evaluation=evaluation, leaderboard_records=leaderboard, output_path=args.output)
        print(json.dumps({"dashboard": out, "evaluation": evaluation_path}, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
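An illustrative way to drive this CLI programmatically (sketch only; assumes the package is installed and the repository's default config and leaderboard files are present):

import sys

from osint_env.cli import main

# Equivalent to running the "leaderboard" subcommand from the shell;
# all common flags fall back to their defaults defined above.
sys.argv = ["osint-env", "leaderboard", "--top", "5"]
main()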
src/osint_env/config/__init__.py
ADDED
@@ -0,0 +1,9 @@
from osint_env.config.shared import RuntimeDefaults, SharedConfig, clone_environment_config, load_seeding_config, load_shared_config

__all__ = [
    "RuntimeDefaults",
    "SharedConfig",
    "clone_environment_config",
    "load_seeding_config",
    "load_shared_config",
]
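A minimal sketch of how these helpers are combined elsewhere in this commit (illustrative; the paths are the defaults used by the CLI and the fixed-levels baseline above):

from osint_env.config import clone_environment_config, load_seeding_config, load_shared_config

# Load the shared config, then derive a per-run environment config without
# mutating the shared instance, and swap in a specific seed file.
shared = load_shared_config("config/shared_config.json")
env_cfg = clone_environment_config(shared.environment)
env_cfg.seeding = load_seeding_config("datasets/fixed_levels/seed_fixed_levels.json")
print(env_cfg.max_steps, len(env_cfg.seeding.seeded_questions))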
src/osint_env/config/shared.py
ADDED
@@ -0,0 +1,279 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
import json
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from osint_env.domain.models import (
|
| 10 |
+
EnvironmentConfig,
|
| 11 |
+
LLMConfig,
|
| 12 |
+
NodeType,
|
| 13 |
+
SeedingConfig,
|
| 14 |
+
SeedEdgeSpec,
|
| 15 |
+
SeedNodeSpec,
|
| 16 |
+
SeedQuestionSpec,
|
| 17 |
+
SpawnRewardConfig,
|
| 18 |
+
SwarmConfig,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass(slots=True)
|
| 23 |
+
class RuntimeDefaults:
|
| 24 |
+
default_episodes: int = 20
|
| 25 |
+
leaderboard_path: str = "artifacts/leaderboard.json"
|
| 26 |
+
dashboard_path: str = "artifacts/osint_dashboard.html"
|
| 27 |
+
sweep_dashboard_dir: str = "artifacts/sweep_dashboards"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(slots=True)
|
| 31 |
+
class SharedConfig:
|
| 32 |
+
environment: EnvironmentConfig = field(default_factory=EnvironmentConfig)
|
| 33 |
+
runtime: RuntimeDefaults = field(default_factory=RuntimeDefaults)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def clone_environment_config(config: EnvironmentConfig) -> EnvironmentConfig:
|
| 37 |
+
return copy.deepcopy(config)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _as_dict(value: Any) -> dict[str, Any]:
|
| 41 |
+
return value if isinstance(value, dict) else {}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _parse_int(value: Any, default: int) -> int:
|
| 45 |
+
try:
|
| 46 |
+
return int(value)
|
| 47 |
+
except (TypeError, ValueError):
|
| 48 |
+
return default
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _parse_float(value: Any, default: float) -> float:
|
| 52 |
+
try:
|
| 53 |
+
return float(value)
|
| 54 |
+
except (TypeError, ValueError):
|
| 55 |
+
return default
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _parse_bool(value: Any, default: bool) -> bool:
|
| 59 |
+
if isinstance(value, bool):
|
| 60 |
+
return value
|
| 61 |
+
if isinstance(value, str):
|
| 62 |
+
lowered = value.strip().lower()
|
| 63 |
+
if lowered in {"1", "true", "yes", "y", "on"}:
|
| 64 |
+
return True
|
| 65 |
+
if lowered in {"0", "false", "no", "n", "off"}:
|
| 66 |
+
return False
|
| 67 |
+
return default
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _parse_str_list(value: Any, default: list[str]) -> list[str]:
|
| 71 |
+
if isinstance(value, list):
|
| 72 |
+
items = [str(item).strip() for item in value if str(item).strip()]
|
| 73 |
+
return items or list(default)
|
| 74 |
+
if isinstance(value, str):
|
| 75 |
+
items = [part.strip() for part in value.split(",") if part.strip()]
|
| 76 |
+
return items or list(default)
|
| 77 |
+
return list(default)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _infer_node_type(node_id: str) -> NodeType:
|
| 81 |
+
prefix = str(node_id).split("_", 1)[0].lower()
|
| 82 |
+
mapping = {
|
| 83 |
+
"user": NodeType.USER,
|
| 84 |
+
"alias": NodeType.ALIAS,
|
| 85 |
+
"org": NodeType.ORG,
|
| 86 |
+
"loc": NodeType.LOCATION,
|
| 87 |
+
"location": NodeType.LOCATION,
|
| 88 |
+
"post": NodeType.POST,
|
| 89 |
+
"thr": NodeType.THREAD,
|
| 90 |
+
"thread": NodeType.THREAD,
|
| 91 |
+
"event": NodeType.EVENT,
|
| 92 |
+
}
|
| 93 |
+
return mapping.get(prefix, NodeType.USER)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _parse_node_type(value: Any, node_id: str) -> NodeType:
|
| 97 |
+
if isinstance(value, NodeType):
|
| 98 |
+
return value
|
| 99 |
+
if isinstance(value, str):
|
| 100 |
+
raw = value.strip().lower()
|
| 101 |
+
try:
|
| 102 |
+
return NodeType(raw)
|
| 103 |
+
except ValueError:
|
| 104 |
+
return _infer_node_type(node_id)
|
| 105 |
+
return _infer_node_type(node_id)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _parse_seed_edge(item: dict[str, Any]) -> SeedEdgeSpec | None:
|
| 109 |
+
src = str(item.get("src", "")).strip()
|
| 110 |
+
rel = str(item.get("rel", "")).strip()
|
| 111 |
+
dst = str(item.get("dst", "")).strip()
|
| 112 |
+
if not src or not rel or not dst:
|
| 113 |
+
return None
|
| 114 |
+
confidence = _parse_float(item.get("confidence", 1.0), 1.0)
|
| 115 |
+
return SeedEdgeSpec(src=src, rel=rel, dst=dst, confidence=confidence)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _parse_seeding(data: dict[str, Any]) -> SeedingConfig:
|
| 119 |
+
seeded_nodes: list[SeedNodeSpec] = []
|
| 120 |
+
for item in data.get("seeded_nodes", []):
|
| 121 |
+
row = _as_dict(item)
|
| 122 |
+
node_id = str(row.get("node_id", "")).strip()
|
| 123 |
+
if not node_id:
|
| 124 |
+
continue
|
| 125 |
+
node_type = _parse_node_type(row.get("node_type"), node_id)
|
| 126 |
+
attrs = _as_dict(row.get("attrs"))
|
| 127 |
+
seeded_nodes.append(SeedNodeSpec(node_id=node_id, node_type=node_type, attrs=attrs))
|
| 128 |
+
|
| 129 |
+
seeded_edges: list[SeedEdgeSpec] = []
|
| 130 |
+
for item in data.get("seeded_edges", []):
|
| 131 |
+
edge = _parse_seed_edge(_as_dict(item))
|
| 132 |
+
if edge is not None:
|
| 133 |
+
seeded_edges.append(edge)
|
| 134 |
+
|
| 135 |
+
seeded_questions: list[SeedQuestionSpec] = []
|
| 136 |
+
for item in data.get("seeded_questions", []):
|
| 137 |
+
row = _as_dict(item)
|
| 138 |
+
question = str(row.get("question", "")).strip()
|
| 139 |
+
if not question:
|
| 140 |
+
continue
|
| 141 |
+
answer_val = row.get("answer")
|
| 142 |
+
answer = str(answer_val).strip() if answer_val is not None and str(answer_val).strip() else None
|
| 143 |
+
task_type = str(row.get("task_type", "seeded")).strip() or "seeded"
|
| 144 |
+
support_edges: list[SeedEdgeSpec] = []
|
| 145 |
+
for edge_item in row.get("supporting_edges", []):
|
| 146 |
+
edge = _parse_seed_edge(_as_dict(edge_item))
|
| 147 |
+
if edge is not None:
|
| 148 |
+
support_edges.append(edge)
|
| 149 |
+
metadata = _as_dict(row.get("metadata"))
|
| 150 |
+
seeded_questions.append(
|
| 151 |
+
SeedQuestionSpec(
|
| 152 |
+
question=question,
|
| 153 |
+
answer=answer,
|
| 154 |
+
task_type=task_type,
|
| 155 |
+
supporting_edges=support_edges,
|
| 156 |
+
metadata=metadata,
|
| 157 |
+
)
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
return SeedingConfig(
|
| 161 |
+
seeded_nodes=seeded_nodes,
|
| 162 |
+
seeded_edges=seeded_edges,
|
| 163 |
+
seeded_questions=seeded_questions,
|
| 164 |
+
llm_generate_remaining_graph=_parse_bool(data.get("llm_generate_remaining_graph"), True),
|
| 165 |
+
llm_generate_remaining_tasks=_parse_bool(data.get("llm_generate_remaining_tasks"), True),
|
| 166 |
+
llm_generated_edge_budget=max(0, _parse_int(data.get("llm_generated_edge_budget"), 6)),
|
| 167 |
+
llm_generated_task_budget=max(0, _parse_int(data.get("llm_generated_task_budget"), 8)),
|
| 168 |
+
llm_generation_parallel=_parse_bool(data.get("llm_generation_parallel"), True),
|
| 169 |
+
llm_generation_workers=max(1, _parse_int(data.get("llm_generation_workers"), 3)),
|
| 170 |
+
llm_generation_retries=max(1, _parse_int(data.get("llm_generation_retries"), 2)),
|
| 171 |
+
allow_template_fallback_on_llm_failure=_parse_bool(
|
| 172 |
+
data.get("allow_template_fallback_on_llm_failure"),
|
| 173 |
+
False,
|
| 174 |
+
),
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def load_seeding_config(path: str | Path) -> SeedingConfig:
|
| 179 |
+
payload = json.loads(Path(path).read_text(encoding="utf-8"))
|
| 180 |
+
if not isinstance(payload, dict):
|
| 181 |
+
raise ValueError("Seed file must contain a JSON object.")
|
| 182 |
+
source = _as_dict(payload.get("seeding", payload))
|
| 183 |
+
return _parse_seeding(source)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+


def _parse_environment(payload: dict[str, Any]) -> EnvironmentConfig:
    env_data = _as_dict(payload.get("environment", payload))
    dataset_data = _as_dict(payload.get("dataset", env_data.get("dataset", {})))
    swarm_data = _as_dict(payload.get("swarm", env_data.get("swarm", {})))
    spawn_data = _as_dict(payload.get("spawn_reward", env_data.get("spawn_reward", {})))
    seeding_data = _as_dict(payload.get("seeding", env_data.get("seeding", {})))
    llm_data = _as_dict(payload.get("llm", env_data.get("llm", {})))

    dataset_mode = str(dataset_data.get("mode", env_data.get("dataset_mode", "canonical"))).strip().lower()
    if dataset_mode not in {"canonical", "metaqa"}:
        dataset_mode = "canonical"

    metaqa_variant = str(dataset_data.get("metaqa_variant", env_data.get("metaqa_variant", "vanilla"))).strip().lower()
    if metaqa_variant not in {"vanilla", "ntm"}:
        metaqa_variant = "vanilla"

    env = EnvironmentConfig(
        n_users=max(4, _parse_int(env_data.get("n_users"), 40)),
        alias_density=max(0.0, min(1.0, _parse_float(env_data.get("alias_density"), 0.35))),
        noise_level=max(0.0, min(1.0, _parse_float(env_data.get("noise_level"), 0.15))),
        red_herring_rate=max(0.0, min(1.0, _parse_float(env_data.get("red_herring_rate"), 0.1))),
        max_steps=max(2, _parse_int(env_data.get("max_steps"), 18)),
        seed=_parse_int(env_data.get("seed"), 7),
        dataset_mode=dataset_mode,
        metaqa_root=str(dataset_data.get("metaqa_root", env_data.get("metaqa_root", "metaQA"))).strip() or "metaQA",
        metaqa_kb_path=str(dataset_data.get("metaqa_kb_path", env_data.get("metaqa_kb_path", ""))).strip(),
        metaqa_variant=metaqa_variant,
        metaqa_hops=_parse_str_list(
            dataset_data.get("metaqa_hops", env_data.get("metaqa_hops", ["1-hop", "2-hop", "3-hop"])),
            ["1-hop", "2-hop", "3-hop"],
        ),
        metaqa_splits=_parse_str_list(
            dataset_data.get("metaqa_splits", env_data.get("metaqa_splits", ["train", "dev", "test"])),
            ["train", "dev", "test"],
        ),
    )

    env.swarm = SwarmConfig(
        enabled=_parse_bool(swarm_data.get("enabled"), False),
        max_agents=max(1, _parse_int(swarm_data.get("max_agents"), 3)),
        max_breadth=max(1, _parse_int(swarm_data.get("max_breadth"), 2)),
        max_width=max(1, _parse_int(swarm_data.get("max_width"), 2)),
        max_depth=max(1, _parse_int(swarm_data.get("max_depth"), 2)),
        planner_rounds=max(1, _parse_int(swarm_data.get("planner_rounds"), 2)),
        tools_per_agent=max(1, _parse_int(swarm_data.get("tools_per_agent"), 1)),
    )

    env.spawn_reward = SpawnRewardConfig(
        lambda_parallel=max(0.0, _parse_float(spawn_data.get("lambda_parallel"), 0.15)),
        lambda_finish=max(0.0, _parse_float(spawn_data.get("lambda_finish"), 0.2)),
        anneal=max(0.0, min(1.0, _parse_float(spawn_data.get("anneal"), 1.0))),
        max_parallel_hint=max(1, _parse_int(spawn_data.get("max_parallel_hint"), 3)),
    )

    env.seeding = _parse_seeding(seeding_data)
    env.llm = LLMConfig(
        provider=str(llm_data.get("provider", "mock")).strip() or "mock",
        model=str(llm_data.get("model", "qwen3:2b")).strip() or "qwen3:2b",
        temperature=_parse_float(llm_data.get("temperature"), 0.1),
        max_tokens=max(1, _parse_int(llm_data.get("max_tokens"), 256)),
        timeout_seconds=max(1, _parse_int(llm_data.get("timeout_seconds"), 240)),
        ollama_base_url=str(llm_data.get("ollama_base_url", "http://127.0.0.1:11434")).strip()
        or "http://127.0.0.1:11434",
        openai_base_url=str(llm_data.get("openai_base_url", "https://api.openai.com/v1")).strip()
        or "https://api.openai.com/v1",
        openai_api_key_env=str(llm_data.get("openai_api_key_env", "OPENAI_API_KEY")).strip() or "OPENAI_API_KEY",
        openai_api_key=str(llm_data.get("openai_api_key", "")).strip(),
    )
    return env
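
# Illustrative shared-config shape consumed by _parse_environment() and
# _parse_runtime(); section and key names follow the parsers in this module,
# and every field may also be nested under "environment" as a fallback. The
# values shown are examples, not the shipped defaults.
#
#   {
#     "environment": {"n_users": 40, "max_steps": 18, "seed": 7},
#     "dataset": {"mode": "metaqa", "metaqa_variant": "vanilla"},
#     "swarm": {"enabled": true, "max_agents": 3},
#     "spawn_reward": {"lambda_parallel": 0.15, "lambda_finish": 0.2},
#     "llm": {"provider": "ollama", "model": "qwen3:2b", "ollama_base_url": "http://127.0.0.1:11434"},
#     "runtime": {"default_episodes": 20, "leaderboard_path": "artifacts/leaderboard.json"}
#   }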


def _parse_runtime(payload: dict[str, Any]) -> RuntimeDefaults:
    runtime = _as_dict(payload.get("runtime", {}))
    return RuntimeDefaults(
        default_episodes=max(1, _parse_int(runtime.get("default_episodes"), 20)),
        leaderboard_path=str(runtime.get("leaderboard_path", "artifacts/leaderboard.json")),
        dashboard_path=str(runtime.get("dashboard_path", "artifacts/osint_dashboard.html")),
        sweep_dashboard_dir=str(runtime.get("sweep_dashboard_dir", "artifacts/sweep_dashboards")),
    )


def load_shared_config(path: str | Path | None) -> SharedConfig:
    if not path:
        return SharedConfig()

    file_path = Path(path)
    if not file_path.exists():
        return SharedConfig()

    payload = json.loads(file_path.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError("Shared config file must contain a JSON object.")

    return SharedConfig(environment=_parse_environment(payload), runtime=_parse_runtime(payload))
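A minimal usage sketch for the two loaders above, assuming the module is importable as osint_env.config.shared (src layout) and that the config dataclasses expose the same attribute names they are constructed with; the paths point at files shipped in this snapshot.

from osint_env.config.shared import load_seeding_config, load_shared_config

# load_shared_config falls back to built-in defaults when the path is None or missing.
cfg = load_shared_config("config/shared_config.json")
print(cfg.environment.dataset_mode, cfg.environment.llm.provider, cfg.environment.llm.model)
print(cfg.runtime.default_episodes, cfg.runtime.leaderboard_path)

# Seed files may wrap the payload in a top-level "seeding" key or supply the fields bare.
seeding = load_seeding_config("config/seed_example.json")
print(len(seeding.seeded_questions), seeding.llm_generated_task_budget)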
src/osint_env/data/__init__.py
ADDED
@@ -0,0 +1,2 @@
"""Dataset generation package."""