Spaces:
Sleeping
Sleeping
Commit ·
87f8562
1
Parent(s): 4de4725
Removed unintended embedded repo and added to gitignore
Browse files- .gitignore +1 -0
- .tmp_compare/Meta-s-LedgerShield +1 -0
- README.md +5 -4
- datasets/fixed_levels/leaderboard_fixed_levels.json +43 -0
- inference.py +1 -1
- openenv.yaml +4 -4
- server.py +43 -4
- src/osint_env/api/models.py +4 -1
- tests/test_server.py +36 -0
.gitignore
CHANGED
|
@@ -4,3 +4,4 @@ blueprint.txt
|
|
| 4 |
artifacts/*
|
| 5 |
*.html
|
| 6 |
.venv/
|
|
|
|
|
|
| 4 |
artifacts/*
|
| 5 |
*.html
|
| 6 |
.venv/
|
| 7 |
+
.tmp_compare/
|
.tmp_compare/Meta-s-LedgerShield
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit fd5c9b60ddfbd2eba9d09001938b63169ac98f7b
|
README.md
CHANGED
|
@@ -209,12 +209,13 @@ The FastAPI app serves:
|
|
| 209 |
- `/`: overview page
|
| 210 |
- `/dashboard`: generated benchmark dashboard
|
| 211 |
- `/api/environment`: environment metadata
|
| 212 |
-
- `/healthz`: health check
|
|
|
|
| 213 |
- `/openenv.yaml`: OpenEnv HTTP spec stub
|
| 214 |
- `/openenv/tasks`: task enumeration
|
| 215 |
-
- `/openenv/reset`: episode reset
|
| 216 |
-
- `/openenv/step`: episode step
|
| 217 |
-
- `/openenv/state/{session_id}`:
|
| 218 |
|
| 219 |
## Automated Validation
|
| 220 |
|
|
|
|
| 209 |
- `/`: overview page
|
| 210 |
- `/dashboard`: generated benchmark dashboard
|
| 211 |
- `/api/environment`: environment metadata
|
| 212 |
+
- `/health`: health check (validator-friendly alias)
|
| 213 |
+
- `/healthz`: health check (legacy alias)
|
| 214 |
- `/openenv.yaml`: OpenEnv HTTP spec stub
|
| 215 |
- `/openenv/tasks`: task enumeration
|
| 216 |
+
- `/reset` and `/openenv/reset`: episode reset endpoints
|
| 217 |
+
- `/step` and `/openenv/step`: episode step endpoints
|
| 218 |
+
- `/state` and `/openenv/state/{session_id}`: session state endpoints (`/state` returns the latest session)
|
| 219 |
|
| 220 |
## Automated Validation
|
| 221 |
|
datasets/fixed_levels/leaderboard_fixed_levels.json
CHANGED
|
@@ -881,5 +881,48 @@
|
|
| 881 |
},
|
| 882 |
"run_id": "run_0021",
|
| 883 |
"run_name": "fixed_levels_qwen_swarm"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
}
|
| 885 |
]
|
|
|
|
| 881 |
},
|
| 882 |
"run_id": "run_0021",
|
| 883 |
"run_name": "fixed_levels_qwen_swarm"
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"config": {
|
| 887 |
+
"llm_model": "gpt-5.4-mini",
|
| 888 |
+
"llm_provider": "openai",
|
| 889 |
+
"max_agents": 3,
|
| 890 |
+
"max_breadth": 2,
|
| 891 |
+
"max_depth": 2,
|
| 892 |
+
"max_steps": 24,
|
| 893 |
+
"max_width": 2,
|
| 894 |
+
"seed": 2026,
|
| 895 |
+
"seeded_questions": 30,
|
| 896 |
+
"swarm_enabled": true
|
| 897 |
+
},
|
| 898 |
+
"created_at": "2026-04-07T15:59:20+00:00",
|
| 899 |
+
"episodes": 1,
|
| 900 |
+
"metrics": {
|
| 901 |
+
"avg_compactness_reward": 0.0,
|
| 902 |
+
"avg_connectivity_gain_reward": 0.0,
|
| 903 |
+
"avg_connectivity_reward": 0.0,
|
| 904 |
+
"avg_diversity_reward": 0.0,
|
| 905 |
+
"avg_entity_informativeness_reward": 0.0,
|
| 906 |
+
"avg_format_reward": 0.15,
|
| 907 |
+
"avg_graph_f1": 0.0,
|
| 908 |
+
"avg_knowledge_carrier_reward": 0.0,
|
| 909 |
+
"avg_knowledge_indexing_reward": 0.0,
|
| 910 |
+
"avg_relation_informativeness_reward": 0.0,
|
| 911 |
+
"avg_reward": 0.5519400198339021,
|
| 912 |
+
"avg_soft_shaping_reward": 0.0,
|
| 913 |
+
"avg_spawn_count": 0.0,
|
| 914 |
+
"avg_spawn_critical_steps": 0.0,
|
| 915 |
+
"avg_steps_to_solution": 1.0,
|
| 916 |
+
"deanonymization_accuracy": 0.0,
|
| 917 |
+
"leaderboard_score": 0.2785970009916951,
|
| 918 |
+
"retrieval_signal": 0.5,
|
| 919 |
+
"spawn_completion_rate": 0.0,
|
| 920 |
+
"spawn_signal": 0.4,
|
| 921 |
+
"structural_signal": 0.5,
|
| 922 |
+
"task_success_rate": 0.0,
|
| 923 |
+
"tool_efficiency": 1.0
|
| 924 |
+
},
|
| 925 |
+
"run_id": "run_0022",
|
| 926 |
+
"run_name": "fixed_levels_qwen_swarm"
|
| 927 |
}
|
| 928 |
]
|
inference.py
CHANGED
|
@@ -97,7 +97,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error: str | Non
|
|
| 97 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 98 |
rewards_text = ",".join(f"{value:.2f}" for value in rewards)
|
| 99 |
print(
|
| 100 |
-
f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.
|
| 101 |
flush=True,
|
| 102 |
)
|
| 103 |
|
|
|
|
| 97 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 98 |
rewards_text = ",".join(f"{value:.2f}" for value in rewards)
|
| 99 |
print(
|
| 100 |
+
f"[END] success={str(bool(success)).lower()} steps={steps} score={score:.2f} rewards={rewards_text}",
|
| 101 |
flush=True,
|
| 102 |
)
|
| 103 |
|
openenv.yaml
CHANGED
|
@@ -7,7 +7,7 @@ transport:
|
|
| 7 |
endpoints:
|
| 8 |
health:
|
| 9 |
method: GET
|
| 10 |
-
path: /healthz
|
| 11 |
metadata:
|
| 12 |
method: GET
|
| 13 |
path: /api/environment
|
|
@@ -16,13 +16,13 @@ endpoints:
|
|
| 16 |
path: /openenv/tasks
|
| 17 |
reset:
|
| 18 |
method: POST
|
| 19 |
-
path: /openenv/reset
|
| 20 |
step:
|
| 21 |
method: POST
|
| 22 |
-
path: /openenv/step
|
| 23 |
state:
|
| 24 |
method: GET
|
| 25 |
-
path: /openenv/state/{session_id}
|
| 26 |
models:
|
| 27 |
action_space:
|
| 28 |
- CALL_TOOL
|
|
|
|
| 7 |
endpoints:
|
| 8 |
health:
|
| 9 |
method: GET
|
| 10 |
+
path: /health
|
| 11 |
metadata:
|
| 12 |
method: GET
|
| 13 |
path: /api/environment
|
|
|
|
| 16 |
path: /openenv/tasks
|
| 17 |
reset:
|
| 18 |
method: POST
|
| 19 |
+
path: /reset
|
| 20 |
step:
|
| 21 |
method: POST
|
| 22 |
+
path: /step
|
| 23 |
state:
|
| 24 |
method: GET
|
| 25 |
+
path: /state
|
| 26 |
models:
|
| 27 |
action_space:
|
| 28 |
- CALL_TOOL
|
server.py
CHANGED
|
@@ -43,6 +43,7 @@ OPENENV_SPEC_PATH = Path("openenv.yaml")
|
|
| 43 |
_SESSION_LOCK = Lock()
|
| 44 |
_SESSIONS: dict[str, OSINTEnvironment] = {}
|
| 45 |
_RESET_COUNTER = 0
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def _load_json(path: Path) -> dict[str, Any] | None:
|
|
@@ -147,8 +148,26 @@ def _get_session_env(session_id: str) -> OSINTEnvironment:
|
|
| 147 |
|
| 148 |
|
| 149 |
def _store_session(session_id: str, env: OSINTEnvironment) -> None:
|
|
|
|
| 150 |
with _SESSION_LOCK:
|
| 151 |
_SESSIONS[session_id] = env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
def _task_lookup(env: OSINTEnvironment) -> dict[str, Any]:
|
|
@@ -400,6 +419,11 @@ def healthz() -> JSONResponse:
|
|
| 400 |
return JSONResponse({"status": "ok"})
|
| 401 |
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
@app.get("/openenv.yaml")
|
| 404 |
def openenv_spec() -> FileResponse:
|
| 405 |
return FileResponse(OPENENV_SPEC_PATH, media_type="text/yaml")
|
|
@@ -456,8 +480,12 @@ async def openenv_reset(request: Request) -> OpenEnvResponseEnvelope:
|
|
| 456 |
|
| 457 |
|
| 458 |
@app.post("/openenv/step", response_model=OpenEnvResponseEnvelope)
|
|
|
|
|
|
|
|
|
|
| 459 |
def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
|
| 460 |
-
env = _get_session_env(request.session_id)
|
|
|
|
| 461 |
action_type_raw = request.resolved_action_type().strip()
|
| 462 |
if not action_type_raw:
|
| 463 |
raise HTTPException(status_code=400, detail="Missing action_type")
|
|
@@ -467,7 +495,7 @@ def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
|
|
| 467 |
raise HTTPException(status_code=400, detail=f"Unsupported action_type {action_type_raw}") from exc
|
| 468 |
observation, reward, done, info = env.step(Action(action_type=action_type, payload=request.resolved_payload()))
|
| 469 |
return OpenEnvResponseEnvelope(
|
| 470 |
-
session_id=request.session_id,
|
| 471 |
observation=_serialize_observation(observation),
|
| 472 |
reward=float(reward),
|
| 473 |
done=bool(done),
|
|
@@ -475,8 +503,7 @@ def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
|
|
| 475 |
)
|
| 476 |
|
| 477 |
|
| 478 |
-
@app.get("/openenv/state/{session_id}", response_model=OpenEnvResponseEnvelope)
|
| 479 |
-
def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
|
| 480 |
env = _get_session_env(session_id)
|
| 481 |
if env.state is None:
|
| 482 |
raise HTTPException(status_code=400, detail="Session has not been reset yet")
|
|
@@ -489,6 +516,18 @@ def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
|
|
| 489 |
)
|
| 490 |
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
@app.post("/openenv/report_inference", response_model=OpenEnvInferenceReportResponse)
|
| 493 |
def openenv_report_inference(request: OpenEnvInferenceReportRequest) -> OpenEnvInferenceReportResponse:
|
| 494 |
env = _build_environment()
|
|
|
|
| 43 |
_SESSION_LOCK = Lock()
|
| 44 |
_SESSIONS: dict[str, OSINTEnvironment] = {}
|
| 45 |
_RESET_COUNTER = 0
|
| 46 |
+
_LATEST_SESSION_ID: str | None = None
|
| 47 |
|
| 48 |
|
| 49 |
def _load_json(path: Path) -> dict[str, Any] | None:
|
|
|
|
| 148 |
|
| 149 |
|
| 150 |
def _store_session(session_id: str, env: OSINTEnvironment) -> None:
|
| 151 |
+
global _LATEST_SESSION_ID
|
| 152 |
with _SESSION_LOCK:
|
| 153 |
_SESSIONS[session_id] = env
|
| 154 |
+
_LATEST_SESSION_ID = session_id
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _latest_session_id() -> str:
|
| 158 |
+
with _SESSION_LOCK:
|
| 159 |
+
if _LATEST_SESSION_ID and _LATEST_SESSION_ID in _SESSIONS:
|
| 160 |
+
return _LATEST_SESSION_ID
|
| 161 |
+
if _SESSIONS:
|
| 162 |
+
return next(reversed(_SESSIONS))
|
| 163 |
+
raise HTTPException(status_code=404, detail="No active session. Call /reset first.")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _resolve_session_id(session_id: str | None) -> str:
|
| 167 |
+
token = str(session_id or "").strip()
|
| 168 |
+
if token:
|
| 169 |
+
return token
|
| 170 |
+
return _latest_session_id()
|
| 171 |
|
| 172 |
|
| 173 |
def _task_lookup(env: OSINTEnvironment) -> dict[str, Any]:
|
|
|
|
| 419 |
return JSONResponse({"status": "ok"})
|
| 420 |
|
| 421 |
|
| 422 |
+
@app.get("/health")
|
| 423 |
+
def health() -> JSONResponse:
|
| 424 |
+
return healthz()
|
| 425 |
+
|
| 426 |
+
|
| 427 |
@app.get("/openenv.yaml")
|
| 428 |
def openenv_spec() -> FileResponse:
|
| 429 |
return FileResponse(OPENENV_SPEC_PATH, media_type="text/yaml")
|
|
|
|
| 480 |
|
| 481 |
|
| 482 |
@app.post("/openenv/step", response_model=OpenEnvResponseEnvelope)
|
| 483 |
+
@app.post("/openenv/step/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 484 |
+
@app.post("/step", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 485 |
+
@app.post("/step/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 486 |
def openenv_step(request: OpenEnvActionRequest) -> OpenEnvResponseEnvelope:
|
| 487 |
+
session_id = _resolve_session_id(request.session_id)
|
| 488 |
+
env = _get_session_env(session_id)
|
| 489 |
action_type_raw = request.resolved_action_type().strip()
|
| 490 |
if not action_type_raw:
|
| 491 |
raise HTTPException(status_code=400, detail="Missing action_type")
|
|
|
|
| 495 |
raise HTTPException(status_code=400, detail=f"Unsupported action_type {action_type_raw}") from exc
|
| 496 |
observation, reward, done, info = env.step(Action(action_type=action_type, payload=request.resolved_payload()))
|
| 497 |
return OpenEnvResponseEnvelope(
|
| 498 |
+
session_id=session_id,
|
| 499 |
observation=_serialize_observation(observation),
|
| 500 |
reward=float(reward),
|
| 501 |
done=bool(done),
|
|
|
|
| 503 |
)
|
| 504 |
|
| 505 |
|
| 506 |
+
def _state_response(session_id: str) -> OpenEnvResponseEnvelope:
|
|
|
|
| 507 |
env = _get_session_env(session_id)
|
| 508 |
if env.state is None:
|
| 509 |
raise HTTPException(status_code=400, detail="Session has not been reset yet")
|
|
|
|
| 516 |
)
|
| 517 |
|
| 518 |
|
| 519 |
+
@app.get("/openenv/state/{session_id}", response_model=OpenEnvResponseEnvelope)
|
| 520 |
+
def openenv_state(session_id: str) -> OpenEnvResponseEnvelope:
|
| 521 |
+
return _state_response(session_id)
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
@app.get("/openenv/state", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 525 |
+
@app.get("/state", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 526 |
+
@app.get("/state/", response_model=OpenEnvResponseEnvelope, include_in_schema=False)
|
| 527 |
+
def openenv_state_latest() -> OpenEnvResponseEnvelope:
|
| 528 |
+
return _state_response(_latest_session_id())
|
| 529 |
+
|
| 530 |
+
|
| 531 |
@app.post("/openenv/report_inference", response_model=OpenEnvInferenceReportResponse)
|
| 532 |
def openenv_report_inference(request: OpenEnvInferenceReportRequest) -> OpenEnvInferenceReportResponse:
|
| 533 |
env = _build_environment()
|
src/osint_env/api/models.py
CHANGED
|
@@ -26,7 +26,10 @@ class OpenEnvResetRequest(BaseModel):
|
|
| 26 |
|
| 27 |
|
| 28 |
class OpenEnvActionRequest(BaseModel):
|
| 29 |
-
session_id: str
|
|
|
|
|
|
|
|
|
|
| 30 |
action_type: str | None = Field(default=None, description="One of CALL_TOOL, ADD_EDGE, ANSWER.")
|
| 31 |
payload: dict[str, Any] = Field(default_factory=dict)
|
| 32 |
action: dict[str, Any] | None = None
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
class OpenEnvActionRequest(BaseModel):
|
| 29 |
+
session_id: str | None = Field(
|
| 30 |
+
default=None,
|
| 31 |
+
description="Session identifier. Optional for /step compatibility alias, which uses the latest session.",
|
| 32 |
+
)
|
| 33 |
action_type: str | None = Field(default=None, description="One of CALL_TOOL, ADD_EDGE, ANSWER.")
|
| 34 |
payload: dict[str, Any] = Field(default_factory=dict)
|
| 35 |
action: dict[str, Any] | None = None
|
tests/test_server.py
CHANGED
|
@@ -16,6 +16,12 @@ def test_server_health():
|
|
| 16 |
assert response.json()["status"] == "ok"
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def test_server_environment_metadata():
|
| 20 |
response = client.get("/api/environment")
|
| 21 |
assert response.status_code == 200
|
|
@@ -115,6 +121,36 @@ def test_openenv_step_accepts_nested_action_payload():
|
|
| 115 |
assert step.json()["done"] is True
|
| 116 |
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
|
| 119 |
latest_evaluation = tmp_path / "latest_evaluation.json"
|
| 120 |
space_dashboard = tmp_path / "space_dashboard.html"
|
|
|
|
| 16 |
assert response.json()["status"] == "ok"
|
| 17 |
|
| 18 |
|
| 19 |
+
def test_server_health_alias():
|
| 20 |
+
response = client.get("/health")
|
| 21 |
+
assert response.status_code == 200
|
| 22 |
+
assert response.json()["status"] == "ok"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
def test_server_environment_metadata():
|
| 26 |
response = client.get("/api/environment")
|
| 27 |
assert response.status_code == 200
|
|
|
|
| 121 |
assert step.json()["done"] is True
|
| 122 |
|
| 123 |
|
| 124 |
+
def test_step_alias_uses_latest_session_when_session_id_missing():
|
| 125 |
+
reset = client.post("/reset", json={"task_index": 0})
|
| 126 |
+
assert reset.status_code == 200
|
| 127 |
+
session_id = reset.json()["session_id"]
|
| 128 |
+
|
| 129 |
+
step = client.post(
|
| 130 |
+
"/step",
|
| 131 |
+
json={
|
| 132 |
+
"action_type": "ANSWER",
|
| 133 |
+
"payload": {"answer": "unknown"},
|
| 134 |
+
},
|
| 135 |
+
)
|
| 136 |
+
assert step.status_code == 200
|
| 137 |
+
body = step.json()
|
| 138 |
+
assert body["session_id"] == session_id
|
| 139 |
+
assert body["done"] is True
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def test_state_alias_returns_latest_session():
|
| 143 |
+
reset = client.post("/reset", json={"task_index": 0})
|
| 144 |
+
assert reset.status_code == 200
|
| 145 |
+
session_id = reset.json()["session_id"]
|
| 146 |
+
|
| 147 |
+
state = client.get("/state")
|
| 148 |
+
assert state.status_code == 200
|
| 149 |
+
body = state.json()
|
| 150 |
+
assert body["session_id"] == session_id
|
| 151 |
+
assert "task" in body["observation"]
|
| 152 |
+
|
| 153 |
+
|
| 154 |
def test_report_inference_updates_latest_evaluation_and_dashboard(tmp_path, monkeypatch):
|
| 155 |
latest_evaluation = tmp_path / "latest_evaluation.json"
|
| 156 |
space_dashboard = tmp_path / "space_dashboard.html"
|