Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- DEMO_SCRIPT.md +21 -12
- Dockerfile +5 -4
- README.md +282 -181
- __init__.py +32 -16
- analyzers/ds_analyzer.py +16 -14
- analyzers/dsa_analyzer.py +8 -7
- analyzers/ml_analyzer.py +16 -14
- analyzers/web_analyzer.py +8 -7
- api/main.py +6 -6
- app/env/__init__.py +3 -3
- app/env/runner.py +55 -102
- app/examples.py +31 -31
- app/models/inference.py +21 -8
- app/services/openai_service.py +6 -2
- app/streamlit_app.py +135 -74
- app/utils/runtime.py +31 -20
- client.py +17 -10
- graders/bug_fix.py +37 -9
- graders/optimization.py +38 -13
- graders/shared.py +104 -23
- graders/syntax.py +37 -9
- models/pytorch_model.py +227 -149
- openenv_python_code_review_env.egg-info/PKG-INFO +34 -25
- openenv_python_code_review_env.egg-info/SOURCES.txt +12 -0
- openenv_python_code_review_env.egg-info/requires.txt +0 -1
- openenv_python_code_review_env.egg-info/top_level.txt +1 -14
- pyproject.toml +22 -6
- schemas/request.py +51 -19
- schemas/response.py +109 -73
- server/Dockerfile +3 -2
- server/app.py +18 -24
- server/demo.py +1 -1
- server/env.py +45 -36
- server/requirements.txt +6 -8
- services/analysis_service.py +258 -139
- services/reward_service.py +56 -38
- services/suggestion_service.py +113 -28
- tests/test_inference_runner.py +53 -6
- tests/test_scoring.py +11 -0
- utils/ast_parser.py +248 -144
- utils/complexity.py +70 -37
- uv.lock +0 -2
DEMO_SCRIPT.md
CHANGED
|
@@ -1,12 +1,21 @@
|
|
| 1 |
-
# TorchReview Copilot Demo Script
|
| 2 |
-
|
| 3 |
-
## 60-90 Second Walkthrough
|
| 4 |
-
|
| 5 |
-
1.
|
| 6 |
-
2.
|
| 7 |
-
3.
|
| 8 |
-
4.
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TorchReview Copilot Demo Script
|
| 2 |
+
|
| 3 |
+
## 60-90 Second Walkthrough
|
| 4 |
+
|
| 5 |
+
1. Introduce TorchReview Copilot as an AI-powered code review system that helps developers find bugs, reduce complexity, and improve maintainability faster.
|
| 6 |
+
2. Frame the problem clearly: manual code reviews are slow, inconsistent, and hard to scale across growing teams and codebases.
|
| 7 |
+
3. Open the Streamlit app and load the `Boundary Bug` example to show a realistic Python regression with failing behavior.
|
| 8 |
+
4. Point out the pipeline on-screen:
|
| 9 |
+
input code, static analysis, PyTorch scoring, suggestions, and RL-ready reward output.
|
| 10 |
+
5. Highlight the PyTorch story:
|
| 11 |
+
the app uses CodeBERTa embeddings through PyTorch to score code quality, maintainability, and domain fit.
|
| 12 |
+
6. Show the headline metrics:
|
| 13 |
+
detected domain, ML score, lint score, and final reward.
|
| 14 |
+
7. Scroll to the reward breakdown and explain that the reward is not arbitrary; it combines ML quality, maintainability, security, lint signals, and complexity penalties.
|
| 15 |
+
8. Open the Suggestions tab and show the prioritized fixes plus the three-step improvement plan.
|
| 16 |
+
9. Switch to the `Performance Hotspot` example to demonstrate that the system adapts to a different issue profile and pushes optimization hints instead of only syntax guidance.
|
| 17 |
+
10. Close by emphasizing that the same repo also works as an OpenEnv environment, so the project is both a usable developer product and an RL-ready benchmark component.
|
| 18 |
+
|
| 19 |
+
## 20-Second Closing Line
|
| 20 |
+
|
| 21 |
+
TorchReview Copilot turns code review into a measurable AI workflow: PyTorch handles semantic scoring, deterministic analyzers keep it grounded, and OpenEnv makes it trainable and benchmarkable.
|
Dockerfile
CHANGED
|
@@ -6,14 +6,16 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 6 |
PYTHONIOENCODING=utf-8 \
|
| 7 |
PIP_NO_CACHE_DIR=1 \
|
| 8 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
WORKDIR /app
|
| 12 |
|
| 13 |
COPY server/requirements.txt /tmp/requirements.txt
|
| 14 |
|
| 15 |
RUN python -m pip install --upgrade pip && \
|
| 16 |
-
pip install -r /tmp/requirements.txt
|
| 17 |
|
| 18 |
COPY . /app
|
| 19 |
|
|
@@ -24,5 +26,4 @@ EXPOSE 8000
|
|
| 24 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 25 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
|
| 26 |
|
| 27 |
-
|
| 28 |
-
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"]
|
|
|
|
| 6 |
PYTHONIOENCODING=utf-8 \
|
| 7 |
PIP_NO_CACHE_DIR=1 \
|
| 8 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 9 |
+
PIP_DEFAULT_TIMEOUT=120 \
|
| 10 |
+
ENABLE_GRADIO_DEMO=false \
|
| 11 |
+
ENABLE_WEB_INTERFACE=false
|
| 12 |
|
| 13 |
WORKDIR /app
|
| 14 |
|
| 15 |
COPY server/requirements.txt /tmp/requirements.txt
|
| 16 |
|
| 17 |
RUN python -m pip install --upgrade pip && \
|
| 18 |
+
pip install --prefer-binary -r /tmp/requirements.txt
|
| 19 |
|
| 20 |
COPY . /app
|
| 21 |
|
|
|
|
| 26 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 27 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
|
| 28 |
|
| 29 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
|
README.md
CHANGED
|
@@ -1,181 +1,282 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
sdk: docker
|
| 4 |
-
app_port: 8000
|
| 5 |
-
base_path: /web
|
| 6 |
-
pinned: false
|
| 7 |
-
tags:
|
| 8 |
-
- openenv
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
``
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
```
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
-
|
| 54 |
-
-
|
| 55 |
-
|
| 56 |
-
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
- The
|
| 64 |
-
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
``
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
``
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
-
|
| 147 |
-
-
|
| 148 |
-
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: TorchReview Copilot
|
| 3 |
+
sdk: docker
|
| 4 |
+
app_port: 8000
|
| 5 |
+
base_path: /web
|
| 6 |
+
pinned: false
|
| 7 |
+
tags:
|
| 8 |
+
- openenv
|
| 9 |
+
- pytorch
|
| 10 |
+
- code-review
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# TorchReview Copilot
|
| 14 |
+
|
| 15 |
+
TorchReview Copilot is an AI-powered code review and improvement system built for the Meta PyTorch OpenEnv Hackathon. It combines deterministic static analysis, a real PyTorch code encoder, domain-aware review logic, and RL-ready reward shaping to help developers catch bugs, reduce complexity, and improve maintainability faster.
|
| 16 |
+
|
| 17 |
+
## Problem Statement
|
| 18 |
+
|
| 19 |
+
Manual code review is slow, inconsistent, and difficult to scale. Small logic bugs slip through, performance hotspots hide in otherwise correct code, and review quality changes from reviewer to reviewer.
|
| 20 |
+
|
| 21 |
+
## Solution
|
| 22 |
+
|
| 23 |
+
TorchReview Copilot accepts Python code, analyzes it with AST and complexity heuristics, scores it with a PyTorch model, and returns:
|
| 24 |
+
|
| 25 |
+
- A code quality score
|
| 26 |
+
- Domain-aware review feedback
|
| 27 |
+
- Actionable improvement suggestions
|
| 28 |
+
- An RL-ready reward signal for OpenEnv environments
|
| 29 |
+
|
| 30 |
+
## Why This Is Hackathon-Worthy
|
| 31 |
+
|
| 32 |
+
- Solves a real developer productivity problem
|
| 33 |
+
- Uses PyTorch meaningfully for model inference, not as a placeholder
|
| 34 |
+
- Produces a measurable reward signal for RL workflows
|
| 35 |
+
- Ships as a usable product with API, UI, docs, tests, and OpenEnv compatibility
|
| 36 |
+
|
| 37 |
+
## Tech Stack
|
| 38 |
+
|
| 39 |
+
- `PyTorch` for model execution and similarity scoring
|
| 40 |
+
- `transformers` with `huggingface/CodeBERTa-small-v1` for pretrained code embeddings
|
| 41 |
+
- `FastAPI` for the analysis API
|
| 42 |
+
- `Streamlit` for the interactive review UI
|
| 43 |
+
- `Pydantic` for request and response validation
|
| 44 |
+
- `OpenAI` Python client for hackathon-compliant LLM action planning in `inference.py`
|
| 45 |
+
- `OpenEnv` for environment, reward, and validator integration
|
| 46 |
+
|
| 47 |
+
## Pipeline
|
| 48 |
+
|
| 49 |
+
```text
|
| 50 |
+
Input Python Code
|
| 51 |
+
-> AST Parsing + Structural Signals
|
| 52 |
+
-> Complexity + Lint Heuristics
|
| 53 |
+
-> PyTorch Model Inference (CodeBERTa / torch fallback)
|
| 54 |
+
-> Domain Analysis + Suggestion Engine
|
| 55 |
+
-> RL Reward Shaping
|
| 56 |
+
-> UI + API + OpenEnv Environment Output
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## PyTorch Integration
|
| 60 |
+
|
| 61 |
+
PyTorch is used in the core scoring path:
|
| 62 |
+
|
| 63 |
+
- The app loads `huggingface/CodeBERTa-small-v1` through `transformers`
|
| 64 |
+
- Input code, repository context, traceback text, and static-analysis hints are embedded with the encoder
|
| 65 |
+
- The resulting embedding is compared against quality, maintainability, domain, and issue prototypes
|
| 66 |
+
- The model produces:
|
| 67 |
+
- `ml_quality_score`
|
| 68 |
+
- `maintainability_score`
|
| 69 |
+
- domain confidences
|
| 70 |
+
- issue probabilities
|
| 71 |
+
|
| 72 |
+
If pretrained weights are unavailable, the project falls back to a torch-native hashed embedding backend so local demos and CI still work offline.
|
| 73 |
+
|
| 74 |
+
## Reward System
|
| 75 |
+
|
| 76 |
+
The system is RL-ready by design. Reward shaping blends model confidence, code quality, security, maintainability, and complexity into a bounded signal.
|
| 77 |
+
|
| 78 |
+
Core reward:
|
| 79 |
+
|
| 80 |
+
```text
|
| 81 |
+
reward = 0.50*ml_score
|
| 82 |
+
+ 0.18*lint_score
|
| 83 |
+
+ 0.12*maintainability_score
|
| 84 |
+
+ 0.10*domain_score
|
| 85 |
+
+ 0.10*security_score
|
| 86 |
+
- 0.20*complexity_penalty
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
The OpenEnv environment adds step-level shaping for:
|
| 90 |
+
|
| 91 |
+
- public test progress
|
| 92 |
+
- syntax recovery
|
| 93 |
+
- runtime improvements
|
| 94 |
+
- error reduction
|
| 95 |
+
- final submission success
|
| 96 |
+
- regressions and invalid actions
|
| 97 |
+
|
| 98 |
+
All task and step rewards are normalized into a strict safe interval for OpenEnv validation and printed in a validator-safe two-decimal band.
|
| 99 |
+
|
| 100 |
+
## Features
|
| 101 |
+
|
| 102 |
+
- Real PyTorch-backed code quality inference
|
| 103 |
+
- Static analysis with syntax, lint, AST, and complexity signals
|
| 104 |
+
- Domain-aware review for DSA, data science, ML/DL, and web code
|
| 105 |
+
- Prioritized suggestions and a compact 3-step improvement plan
|
| 106 |
+
- Auto-fix preview hints for quick wins
|
| 107 |
+
- Real-time Streamlit scoring mode
|
| 108 |
+
- OpenEnv-compatible environment and `inference.py`
|
| 109 |
+
- Deterministic benchmark tasks for syntax fixes, bug fixes, and optimization
|
| 110 |
+
|
| 111 |
+
## WOW Features
|
| 112 |
+
|
| 113 |
+
- Real-time scoring in the Streamlit interface
|
| 114 |
+
- Auto-fix preview panel
|
| 115 |
+
- Reward visualization and score breakdown
|
| 116 |
+
- OpenEnv environment with transparent reward decomposition
|
| 117 |
+
|
| 118 |
+
## Project Structure
|
| 119 |
+
|
| 120 |
+
```text
|
| 121 |
+
root
|
| 122 |
+
|- inference.py
|
| 123 |
+
|- api/
|
| 124 |
+
|- app/
|
| 125 |
+
| |- agents/
|
| 126 |
+
| |- env/
|
| 127 |
+
| |- models/
|
| 128 |
+
| |- services/
|
| 129 |
+
| `- utils/
|
| 130 |
+
|- analyzers/
|
| 131 |
+
|- graders/
|
| 132 |
+
|- models/
|
| 133 |
+
|- schemas/
|
| 134 |
+
|- services/
|
| 135 |
+
|- tasks/
|
| 136 |
+
|- tests/
|
| 137 |
+
`- utils/
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
Key modules:
|
| 141 |
+
|
| 142 |
+
- `models/pytorch_model.py`: PyTorch + transformer inference
|
| 143 |
+
- `services/analysis_service.py`: end-to-end review pipeline
|
| 144 |
+
- `services/reward_service.py`: RL-friendly reward shaping
|
| 145 |
+
- `services/suggestion_service.py`: actionable recommendations
|
| 146 |
+
- `app/streamlit_app.py`: interactive UI
|
| 147 |
+
- `server/env.py`: OpenEnv environment implementation
|
| 148 |
+
- `app/env/runner.py`: strict `inference.py` runner
|
| 149 |
+
|
| 150 |
+
## API
|
| 151 |
+
|
| 152 |
+
Run the analysis API:
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
python -m uvicorn api.main:app --host 0.0.0.0 --port 7860
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
Main endpoint:
|
| 159 |
+
|
| 160 |
+
- `POST /analyze`
|
| 161 |
+
|
| 162 |
+
The API returns:
|
| 163 |
+
|
| 164 |
+
- detected domain
|
| 165 |
+
- static-analysis summary
|
| 166 |
+
- model prediction
|
| 167 |
+
- score breakdown
|
| 168 |
+
- suggestions
|
| 169 |
+
- improvement plan
|
| 170 |
+
|
| 171 |
+
## Streamlit UI
|
| 172 |
+
|
| 173 |
+
Run the product UI locally:
|
| 174 |
+
|
| 175 |
+
```bash
|
| 176 |
+
streamlit run app/streamlit_app.py
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
The UI includes:
|
| 180 |
+
|
| 181 |
+
- code input editor
|
| 182 |
+
- example snippets
|
| 183 |
+
- real-time scoring toggle
|
| 184 |
+
- ML score, lint score, and reward display
|
| 185 |
+
- domain confidence chart
|
| 186 |
+
- reward-signal visualization
|
| 187 |
+
- suggestion list and auto-fix preview
|
| 188 |
+
|
| 189 |
+
## OpenEnv Compatibility
|
| 190 |
+
|
| 191 |
+
This repository is also a valid OpenEnv submission:
|
| 192 |
+
|
| 193 |
+
- `inference.py` is in the repo root
|
| 194 |
+
- `API_BASE_URL` and `MODEL_NAME` have defaults
|
| 195 |
+
- `HF_TOKEN` is read from the environment
|
| 196 |
+
- The runner uses the official `OpenAI` Python client
|
| 197 |
+
- Output follows the required `[START]`, `[STEP]`, `[END]` contract
|
| 198 |
+
|
| 199 |
+
Example:
|
| 200 |
+
|
| 201 |
+
```text
|
| 202 |
+
[START] task=syntax_fix_invoice_totals env=python_code_review_env model=Qwen/Qwen2.5-3B-Instruct
|
| 203 |
+
[STEP] step=1 action=run_tests reward=0.34 done=false error=null
|
| 204 |
+
[STEP] step=2 action=edit_code reward=0.42 done=false error=null
|
| 205 |
+
[STEP] step=3 action=submit_solution reward=0.99 done=true error=null
|
| 206 |
+
[END] success=true steps=3 rewards=0.34,0.42,0.99
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
## Setup
|
| 210 |
+
|
| 211 |
+
Install dependencies:
|
| 212 |
+
|
| 213 |
+
```bash
|
| 214 |
+
pip install -e .[dev]
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
Run tests:
|
| 218 |
+
|
| 219 |
+
```bash
|
| 220 |
+
pytest -q
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
Run the OpenEnv server:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
python -m uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
Run the demo UI mounted into the server:
|
| 230 |
+
|
| 231 |
+
```bash
|
| 232 |
+
set ENABLE_GRADIO_DEMO=true
|
| 233 |
+
set ENABLE_WEB_INTERFACE=true
|
| 234 |
+
python -m uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
## Hugging Face Spaces
|
| 238 |
+
|
| 239 |
+
This repo is designed to run on a Docker-based Hugging Face Space under a `2 vCPU / 8 GB RAM` budget.
|
| 240 |
+
|
| 241 |
+
Recommended Space settings:
|
| 242 |
+
|
| 243 |
+
- SDK: `Docker`
|
| 244 |
+
- Port: `8000`
|
| 245 |
+
- Secret: `HF_TOKEN`
|
| 246 |
+
- Optional vars:
|
| 247 |
+
- `API_BASE_URL`
|
| 248 |
+
- `MODEL_NAME`
|
| 249 |
+
- `ENABLE_GRADIO_DEMO=false`
|
| 250 |
+
- `ENABLE_WEB_INTERFACE=false`
|
| 251 |
+
|
| 252 |
+
## Screenshots
|
| 253 |
+
|
| 254 |
+
Add these before final submission:
|
| 255 |
+
|
| 256 |
+
- Main review UI with code editor and reward metrics
|
| 257 |
+
- Suggestions tab with improvement plan
|
| 258 |
+
- OpenEnv task loop or validator output snippet
|
| 259 |
+
|
| 260 |
+
## Demo Link
|
| 261 |
+
|
| 262 |
+
Add your live Hugging Face Space URL here before final submission.
|
| 263 |
+
|
| 264 |
+
## Demo Script
|
| 265 |
+
|
| 266 |
+
See [DEMO_SCRIPT.md](DEMO_SCRIPT.md) for a concise hackathon walkthrough.
|
| 267 |
+
|
| 268 |
+
## Testing
|
| 269 |
+
|
| 270 |
+
The repo includes coverage for:
|
| 271 |
+
|
| 272 |
+
- score normalization into the strict OpenEnv-safe interval
|
| 273 |
+
- inference output formatting
|
| 274 |
+
- API response structure
|
| 275 |
+
- multi-domain analysis behavior
|
| 276 |
+
- triage and embedding behavior
|
| 277 |
+
|
| 278 |
+
## Notes for Judges
|
| 279 |
+
|
| 280 |
+
- This is not a toy wrapper around an LLM. The review pipeline includes deterministic analysis, PyTorch-based code scoring, and explicit reward shaping.
|
| 281 |
+
- The system is useful both as a developer-facing application and as a benchmark-friendly RL environment.
|
| 282 |
+
- The design intentionally balances product polish with validator reliability.
|
__init__.py
CHANGED
|
@@ -1,19 +1,35 @@
|
|
| 1 |
-
"""Public package exports for python_code_review_env."""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
from .
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
from .
|
| 15 |
-
from .
|
| 16 |
-
from .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
__all__ = [
|
| 19 |
"PythonAction",
|
|
|
|
| 1 |
+
"""Public package exports for python_code_review_env."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from .client import PythonCodeReviewEnv, PythonEnv
|
| 5 |
+
from .models import (
|
| 6 |
+
PyTorchCodeAnalyzerModel,
|
| 7 |
+
PythonAction,
|
| 8 |
+
PythonCodeReviewAction,
|
| 9 |
+
PythonCodeReviewObservation,
|
| 10 |
+
PythonCodeReviewState,
|
| 11 |
+
PythonObservation,
|
| 12 |
+
PythonState,
|
| 13 |
+
)
|
| 14 |
+
from .schemas import AnalyzeCodeRequest, AnalyzeCodeResponse
|
| 15 |
+
from .services import AnalysisService
|
| 16 |
+
from .triage import CodeTriageEngine, HashingEmbeddingBackend, TransformersEmbeddingBackend, get_default_engine
|
| 17 |
+
from .triage_models import TriageResult
|
| 18 |
+
except ImportError: # pragma: no cover
|
| 19 |
+
from client import PythonCodeReviewEnv, PythonEnv
|
| 20 |
+
from models import (
|
| 21 |
+
PyTorchCodeAnalyzerModel,
|
| 22 |
+
PythonAction,
|
| 23 |
+
PythonCodeReviewAction,
|
| 24 |
+
PythonCodeReviewObservation,
|
| 25 |
+
PythonCodeReviewState,
|
| 26 |
+
PythonObservation,
|
| 27 |
+
PythonState,
|
| 28 |
+
)
|
| 29 |
+
from schemas import AnalyzeCodeRequest, AnalyzeCodeResponse
|
| 30 |
+
from services import AnalysisService
|
| 31 |
+
from triage import CodeTriageEngine, HashingEmbeddingBackend, TransformersEmbeddingBackend, get_default_engine
|
| 32 |
+
from triage_models import TriageResult
|
| 33 |
|
| 34 |
__all__ = [
|
| 35 |
"PythonAction",
|
analyzers/ds_analyzer.py
CHANGED
|
@@ -15,13 +15,14 @@ def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dic
|
|
| 15 |
score = 0.72
|
| 16 |
|
| 17 |
if "iterrows(" in code or "itertuples(" in code:
|
| 18 |
-
issues.append(
|
| 19 |
-
AnalysisIssue(
|
| 20 |
-
title="Row-wise dataframe iteration detected",
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
suggestions.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.")
|
| 26 |
score -= 0.18
|
| 27 |
|
|
@@ -30,13 +31,14 @@ def analyze_data_science_code(code: str, parsed: Dict[str, Any], complexity: Dic
|
|
| 30 |
score -= 0.05
|
| 31 |
|
| 32 |
if "fit_transform(" in code and "train_test_split" not in code:
|
| 33 |
-
issues.append(
|
| 34 |
-
AnalysisIssue(
|
| 35 |
-
title="Potential data leakage risk",
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
suggestions.append("Split train and validation data before fitting stateful preprocessing steps.")
|
| 41 |
score -= 0.2
|
| 42 |
|
|
|
|
| 15 |
score = 0.72
|
| 16 |
|
| 17 |
if "iterrows(" in code or "itertuples(" in code:
|
| 18 |
+
issues.append(
|
| 19 |
+
AnalysisIssue(
|
| 20 |
+
title="Row-wise dataframe iteration detected",
|
| 21 |
+
category="performance",
|
| 22 |
+
severity="medium",
|
| 23 |
+
description="Looping through dataframe rows is usually slower and less scalable than vectorized operations.",
|
| 24 |
+
)
|
| 25 |
+
)
|
| 26 |
suggestions.append("Use vectorized pandas or numpy expressions instead of row-wise iteration.")
|
| 27 |
score -= 0.18
|
| 28 |
|
|
|
|
| 31 |
score -= 0.05
|
| 32 |
|
| 33 |
if "fit_transform(" in code and "train_test_split" not in code:
|
| 34 |
+
issues.append(
|
| 35 |
+
AnalysisIssue(
|
| 36 |
+
title="Potential data leakage risk",
|
| 37 |
+
category="correctness",
|
| 38 |
+
severity="high",
|
| 39 |
+
description="Feature transforms appear before an explicit train/test split.",
|
| 40 |
+
)
|
| 41 |
+
)
|
| 42 |
suggestions.append("Split train and validation data before fitting stateful preprocessing steps.")
|
| 43 |
score -= 0.2
|
| 44 |
|
analyzers/dsa_analyzer.py
CHANGED
|
@@ -15,13 +15,14 @@ def analyze_dsa_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, An
|
|
| 15 |
score = 0.7
|
| 16 |
|
| 17 |
if parsed.get("max_loop_depth", 0) >= 2:
|
| 18 |
-
issues.append(
|
| 19 |
-
AnalysisIssue(
|
| 20 |
-
title="Nested loops suggest brute-force behavior",
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
suggestions.append("Consider replacing nested scans with a hashmap, prefix table, or sorted search strategy.")
|
| 26 |
score -= 0.15
|
| 27 |
|
|
|
|
| 15 |
score = 0.7
|
| 16 |
|
| 17 |
if parsed.get("max_loop_depth", 0) >= 2:
|
| 18 |
+
issues.append(
|
| 19 |
+
AnalysisIssue(
|
| 20 |
+
title="Nested loops suggest brute-force behavior",
|
| 21 |
+
category="performance",
|
| 22 |
+
severity="medium",
|
| 23 |
+
description="The implementation scans the input multiple times, which is often avoidable in DSA problems.",
|
| 24 |
+
)
|
| 25 |
+
)
|
| 26 |
suggestions.append("Consider replacing nested scans with a hashmap, prefix table, or sorted search strategy.")
|
| 27 |
score -= 0.15
|
| 28 |
|
analyzers/ml_analyzer.py
CHANGED
|
@@ -15,13 +15,14 @@ def analyze_ml_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any
|
|
| 15 |
score = 0.74
|
| 16 |
|
| 17 |
if "torch" in code and "model.eval()" not in code and "predict" in code.lower():
|
| 18 |
-
issues.append(
|
| 19 |
-
AnalysisIssue(
|
| 20 |
-
title="Inference path may be missing eval mode",
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
suggestions.append("Call model.eval() before inference to disable training-time behavior such as dropout.")
|
| 26 |
score -= 0.18
|
| 27 |
|
|
@@ -30,13 +31,14 @@ def analyze_ml_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, Any
|
|
| 30 |
score -= 0.12
|
| 31 |
|
| 32 |
if parsed.get("calls_backward") and not parsed.get("calls_optimizer_step"):
|
| 33 |
-
issues.append(
|
| 34 |
-
AnalysisIssue(
|
| 35 |
-
title="Backward pass without optimizer step",
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
suggestions.append("Ensure optimizer.step() and optimizer.zero_grad() are placed correctly in the training loop.")
|
| 41 |
score -= 0.12
|
| 42 |
|
|
|
|
| 15 |
score = 0.74
|
| 16 |
|
| 17 |
if "torch" in code and "model.eval()" not in code and "predict" in code.lower():
|
| 18 |
+
issues.append(
|
| 19 |
+
AnalysisIssue(
|
| 20 |
+
title="Inference path may be missing eval mode",
|
| 21 |
+
category="correctness",
|
| 22 |
+
severity="high",
|
| 23 |
+
description="Inference code should place the model in eval mode before prediction.",
|
| 24 |
+
)
|
| 25 |
+
)
|
| 26 |
suggestions.append("Call model.eval() before inference to disable training-time behavior such as dropout.")
|
| 27 |
score -= 0.18
|
| 28 |
|
|
|
|
| 31 |
score -= 0.12
|
| 32 |
|
| 33 |
if parsed.get("calls_backward") and not parsed.get("calls_optimizer_step"):
|
| 34 |
+
issues.append(
|
| 35 |
+
AnalysisIssue(
|
| 36 |
+
title="Backward pass without optimizer step",
|
| 37 |
+
category="correctness",
|
| 38 |
+
severity="medium",
|
| 39 |
+
description="Gradients are computed, but the optimizer step is not obvious in the snippet.",
|
| 40 |
+
)
|
| 41 |
+
)
|
| 42 |
suggestions.append("Ensure optimizer.step() and optimizer.zero_grad() are placed correctly in the training loop.")
|
| 43 |
score -= 0.12
|
| 44 |
|
analyzers/web_analyzer.py
CHANGED
|
@@ -16,13 +16,14 @@ def analyze_web_code(code: str, parsed: Dict[str, Any], complexity: Dict[str, An
|
|
| 16 |
|
| 17 |
route_decorators = set(parsed.get("route_decorators", []))
|
| 18 |
if route_decorators and not parsed.get("uses_pydantic"):
|
| 19 |
-
issues.append(
|
| 20 |
-
AnalysisIssue(
|
| 21 |
-
title="Request validation model is missing",
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
| 26 |
suggestions.append("Add Pydantic request and response models for strict validation and type-safe contracts.")
|
| 27 |
score -= 0.2
|
| 28 |
|
|
|
|
| 16 |
|
| 17 |
route_decorators = set(parsed.get("route_decorators", []))
|
| 18 |
if route_decorators and not parsed.get("uses_pydantic"):
|
| 19 |
+
issues.append(
|
| 20 |
+
AnalysisIssue(
|
| 21 |
+
title="Request validation model is missing",
|
| 22 |
+
category="security",
|
| 23 |
+
severity="high",
|
| 24 |
+
description="Route handlers appear present, but no obvious Pydantic validation layer was detected.",
|
| 25 |
+
)
|
| 26 |
+
)
|
| 27 |
suggestions.append("Add Pydantic request and response models for strict validation and type-safe contracts.")
|
| 28 |
score -= 0.2
|
| 29 |
|
api/main.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""FastAPI backend for the
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -9,7 +9,7 @@ from schemas.response import AnalyzeCodeResponse
|
|
| 9 |
from services.analysis_service import AnalysisService
|
| 10 |
|
| 11 |
|
| 12 |
-
app = FastAPI(title="
|
| 13 |
analysis_service = AnalysisService()
|
| 14 |
|
| 15 |
|
|
@@ -21,7 +21,7 @@ def health() -> dict[str, str]:
|
|
| 21 |
|
| 22 |
|
| 23 |
@app.post("/analyze", response_model=AnalyzeCodeResponse)
|
| 24 |
-
def analyze_code(payload: AnalyzeCodeRequest) -> AnalyzeCodeResponse:
|
| 25 |
-
"""Analyze code
|
| 26 |
-
|
| 27 |
-
return analysis_service.analyze(payload)
|
|
|
|
| 1 |
+
"""FastAPI backend for the AI-powered Python code review platform."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 9 |
from services.analysis_service import AnalysisService
|
| 10 |
|
| 11 |
|
| 12 |
+
app = FastAPI(title="TorchReview Copilot API", version="3.0.0")
|
| 13 |
analysis_service = AnalysisService()
|
| 14 |
|
| 15 |
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
@app.post("/analyze", response_model=AnalyzeCodeResponse)
|
| 24 |
+
def analyze_code(payload: AnalyzeCodeRequest) -> AnalyzeCodeResponse:
|
| 25 |
+
"""Analyze Python code and return review scores, suggestions, and reward signals."""
|
| 26 |
+
|
| 27 |
+
return analysis_service.analyze(payload)
|
app/env/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
-
from .runner import main
|
| 4 |
|
| 5 |
-
__all__ = ["main"]
|
|
|
|
| 1 |
+
"""OpenEnv inference runtime package."""
|
| 2 |
|
| 3 |
+
from .runner import InferenceRunner, main
|
| 4 |
|
| 5 |
+
__all__ = ["InferenceRunner", "main"]
|
app/env/runner.py
CHANGED
|
@@ -1,25 +1,14 @@
|
|
| 1 |
-
"""Strict
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 5 |
from typing import Any
|
| 6 |
|
| 7 |
-
from compat import install_openenv_fastmcp_compat
|
| 8 |
-
|
| 9 |
from app.agents.review_agent import ReviewAgent
|
| 10 |
-
from app.models.inference import
|
| 11 |
from app.services.openai_service import OpenAIActionPlanner
|
| 12 |
-
from app.utils.runtime import
|
| 13 |
-
compact_text,
|
| 14 |
-
format_bool,
|
| 15 |
-
format_error,
|
| 16 |
-
format_reward,
|
| 17 |
-
observation_attr,
|
| 18 |
-
parse_task_ids,
|
| 19 |
-
suppress_output,
|
| 20 |
-
)
|
| 21 |
-
|
| 22 |
-
install_openenv_fastmcp_compat()
|
| 23 |
|
| 24 |
try:
|
| 25 |
from models import PythonCodeReviewAction
|
|
@@ -30,107 +19,71 @@ except ImportError: # pragma: no cover
|
|
| 30 |
|
| 31 |
|
| 32 |
class InferenceRunner:
|
| 33 |
-
"""
|
| 34 |
|
| 35 |
def __init__(self, config: InferenceConfig) -> None:
|
| 36 |
self.config = config
|
| 37 |
self.agent = ReviewAgent(OpenAIActionPlanner(config))
|
| 38 |
|
| 39 |
-
def
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
rewards: list[str] = []
|
| 46 |
-
|
| 47 |
success = False
|
| 48 |
-
fatal_error: str | None = None
|
| 49 |
-
|
| 50 |
-
self._emit_start(task_name)
|
| 51 |
|
|
|
|
| 52 |
try:
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
1,
|
| 58 |
-
min(
|
| 59 |
-
self.config.max_episode_steps,
|
| 60 |
-
int(observation_attr(observation, "attempts_remaining", self.config.max_episode_steps) or self.config.max_episode_steps),
|
| 61 |
-
),
|
| 62 |
-
)
|
| 63 |
-
while not done and step_count < max_steps:
|
| 64 |
decision = self.agent.act(observation)
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
rewards.append(format_reward(reward))
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
except Exception as exc:
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
finally:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
with suppress_output():
|
| 85 |
-
return env.reset(task_id=task_name)
|
| 86 |
-
|
| 87 |
-
def _step_env(
|
| 88 |
-
self,
|
| 89 |
-
env: PythonCodeReviewEnvironment,
|
| 90 |
-
decision: AgentDecision,
|
| 91 |
-
) -> tuple[Any, float, bool, dict[str, Any]]:
|
| 92 |
-
action = PythonCodeReviewAction(action_type=decision.action_type, code=decision.code)
|
| 93 |
-
with suppress_output():
|
| 94 |
-
observation, reward, done, info = env.step_result(action)
|
| 95 |
-
return observation, float(reward), bool(done), dict(info or {})
|
| 96 |
-
|
| 97 |
-
def _resolve_step_error(
|
| 98 |
-
self,
|
| 99 |
-
info: dict[str, Any],
|
| 100 |
-
observation: Any,
|
| 101 |
-
decision: AgentDecision,
|
| 102 |
-
) -> str | None:
|
| 103 |
-
env_error = compact_text(
|
| 104 |
-
info.get("last_action_error") or observation_attr(observation, "last_action_error", None),
|
| 105 |
-
default="",
|
| 106 |
-
)
|
| 107 |
-
if env_error:
|
| 108 |
-
return env_error
|
| 109 |
-
if decision.error:
|
| 110 |
-
return compact_text(decision.error, default="")
|
| 111 |
-
return None
|
| 112 |
-
|
| 113 |
-
def _emit_start(self, task_name: str) -> None:
|
| 114 |
-
print(
|
| 115 |
-
f"[START] task={task_name} env={self.config.benchmark_name} model={self.config.model_name}",
|
| 116 |
-
flush=True,
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
def _emit_step(self, step_count: int, action: str, reward: float, done: bool, error: str | None) -> None:
|
| 120 |
-
print(
|
| 121 |
-
f"[STEP] step={step_count} action={compact_text(action, default='analyze_code')} "
|
| 122 |
-
f"reward={format_reward(reward)} done={format_bool(done)} error={format_error(error)}",
|
| 123 |
-
flush=True,
|
| 124 |
-
)
|
| 125 |
-
|
| 126 |
-
def _emit_end(self, *, success: bool, step_count: int, rewards: list[str]) -> None:
|
| 127 |
-
print(
|
| 128 |
-
f"[END] success={format_bool(success)} steps={step_count} rewards={','.join(rewards)}",
|
| 129 |
-
flush=True,
|
| 130 |
-
)
|
| 131 |
|
| 132 |
|
| 133 |
def main() -> int:
|
| 134 |
-
"""
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Strict OpenEnv inference runner for TorchReview Copilot."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
import os
|
| 6 |
from typing import Any
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from app.agents.review_agent import ReviewAgent
|
| 9 |
+
from app.models.inference import InferenceConfig
|
| 10 |
from app.services.openai_service import OpenAIActionPlanner
|
| 11 |
+
from app.utils.runtime import format_bool, format_error, format_reward, parse_task_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
try:
|
| 14 |
from models import PythonCodeReviewAction
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class InferenceRunner:
|
| 22 |
+
"""Execute one OpenEnv episode and emit the required stdout contract."""
|
| 23 |
|
| 24 |
def __init__(self, config: InferenceConfig) -> None:
|
| 25 |
self.config = config
|
| 26 |
self.agent = ReviewAgent(OpenAIActionPlanner(config))
|
| 27 |
|
| 28 |
+
def _create_env(self) -> PythonCodeReviewEnvironment:
|
| 29 |
+
return PythonCodeReviewEnvironment(verbose=False)
|
| 30 |
+
|
| 31 |
+
def run_task(self, task_id: str) -> int:
|
| 32 |
+
"""Run one task and print strict [START]/[STEP]/[END] lines."""
|
| 33 |
|
| 34 |
+
env = self._create_env()
|
| 35 |
rewards: list[str] = []
|
| 36 |
+
steps = 0
|
| 37 |
success = False
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
print(f"[START] task={task_id} env={self.config.benchmark_name} model={self.config.model_name}")
|
| 40 |
try:
|
| 41 |
+
observation = env.reset(task_id=task_id)
|
| 42 |
+
done = bool(getattr(observation, "done", False))
|
| 43 |
+
|
| 44 |
+
while not done and steps < self.config.max_episode_steps:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
decision = self.agent.act(observation)
|
| 46 |
+
action = PythonCodeReviewAction(action_type=decision.action_type, code=decision.code)
|
| 47 |
+
observation, reward, done, info = env.step_result(action)
|
| 48 |
+
steps += 1
|
| 49 |
rewards.append(format_reward(reward))
|
| 50 |
+
error_value = info.get("last_action_error") if isinstance(info, dict) else None
|
| 51 |
+
if error_value is None:
|
| 52 |
+
error_value = getattr(observation, "last_action_error", None)
|
| 53 |
+
print(
|
| 54 |
+
f"[STEP] step={steps} action={decision.action_type} "
|
| 55 |
+
f"reward={format_reward(reward)} done={format_bool(done)} error={format_error(error_value)}"
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
final_score = float(getattr(observation, "score", 0.0))
|
| 59 |
+
success = bool(done and final_score >= self.config.success_threshold)
|
| 60 |
+
return 0 if success else 1
|
| 61 |
except Exception as exc:
|
| 62 |
+
if steps == 0:
|
| 63 |
+
print(
|
| 64 |
+
f"[STEP] step=1 action=bootstrap reward=0.00 done=true "
|
| 65 |
+
f"error={format_error(f'{type(exc).__name__}: {exc}')}"
|
| 66 |
+
)
|
| 67 |
+
rewards.append("0.00")
|
| 68 |
+
steps = 1
|
| 69 |
+
return 1
|
| 70 |
finally:
|
| 71 |
+
try:
|
| 72 |
+
close_method = getattr(env, "close", None)
|
| 73 |
+
if callable(close_method):
|
| 74 |
+
close_method()
|
| 75 |
+
except Exception:
|
| 76 |
+
pass
|
| 77 |
+
print(f"[END] success={format_bool(success)} steps={steps} rewards={','.join(rewards)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
def main() -> int:
|
| 81 |
+
"""Run a single validator episode using environment defaults."""
|
| 82 |
+
|
| 83 |
+
config = InferenceConfig.from_env()
|
| 84 |
+
task_id = (
|
| 85 |
+
str(os.getenv("OPENENV_TASK_ID") or os.getenv("TASK_ID") or "").strip()
|
| 86 |
+
or parse_task_ids()[0]
|
| 87 |
+
)
|
| 88 |
+
runner = InferenceRunner(config)
|
| 89 |
+
return runner.run_task(task_id)
|
app/examples.py
CHANGED
|
@@ -1,31 +1,31 @@
|
|
| 1 |
-
"""Example snippets for
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
EXAMPLES = {
|
| 7 |
-
"
|
| 8 |
-
"domain_hint": "dsa",
|
| 9 |
-
"context_window": "
|
| 10 |
-
"traceback_text": "",
|
| 11 |
-
"code": """def
|
| 12 |
-
},
|
| 13 |
-
"
|
| 14 |
-
"domain_hint": "
|
| 15 |
-
"context_window": "
|
| 16 |
-
"traceback_text": "",
|
| 17 |
-
"code": """
|
| 18 |
-
},
|
| 19 |
-
"ML
|
| 20 |
-
"domain_hint": "ml_dl",
|
| 21 |
-
"context_window": "
|
| 22 |
-
"traceback_text": "",
|
| 23 |
-
"code": """import torch\n\nclass Predictor:\n def __init__(self, model):\n self.model = model\n\n def predict(self, batch):\n outputs = self.model(batch)\n return outputs.argmax(dim=1)\n""",
|
| 24 |
-
},
|
| 25 |
-
"
|
| 26 |
-
"domain_hint": "web",
|
| 27 |
-
"context_window": "Backend endpoint for creating review tasks from user-submitted payloads.",
|
| 28 |
-
"traceback_text": "",
|
| 29 |
-
"code": """from fastapi import FastAPI, Request\n\napp = FastAPI()\n\n@app.post('/tasks')\ndef create_task(request: Request):\n payload = request.json()\n return {'task': payload}\n""",
|
| 30 |
-
},
|
| 31 |
-
}
|
|
|
|
| 1 |
+
"""Example snippets for the code review UI."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
EXAMPLES = {
|
| 7 |
+
"Boundary Bug": {
|
| 8 |
+
"domain_hint": "dsa",
|
| 9 |
+
"context_window": "Analytics helper that groups sorted events into session windows.",
|
| 10 |
+
"traceback_text": "AssertionError: expected [(1, 3), (8, 8)] but got [(1, 8)] on the boundary case.",
|
| 11 |
+
"code": """def collapse_sessions(events, idle_timeout_minutes):\n if not events:\n return []\n\n sessions = []\n current_start = events[0]['minute']\n current_end = current_start\n\n for event in events[1:]:\n minute = event['minute']\n if minute - current_end > idle_timeout_minutes:\n sessions.append((current_start, current_end))\n current_start = minute\n current_end = minute\n\n return sessions\n""",
|
| 12 |
+
},
|
| 13 |
+
"Performance Hotspot": {
|
| 14 |
+
"domain_hint": "dsa",
|
| 15 |
+
"context_window": "Nightly export job running on a small CPU box with rising traffic volume.",
|
| 16 |
+
"traceback_text": "BenchmarkWarning: function exceeded latency budget due to repeated full-list scans.",
|
| 17 |
+
"code": """def rank_active_users(events):\n users = []\n for event in events:\n if event['status'] == 'active':\n found = False\n for existing in users:\n if existing == event['user_id']:\n found = True\n if not found:\n users.append(event['user_id'])\n\n totals = []\n for user in users:\n count = 0\n for event in events:\n if event['status'] == 'active' and event['user_id'] == user:\n count += 1\n totals.append((user, count))\n\n totals.sort(key=lambda item: (-item[1], item[0]))\n return totals\n""",
|
| 18 |
+
},
|
| 19 |
+
"ML Inference": {
|
| 20 |
+
"domain_hint": "ml_dl",
|
| 21 |
+
"context_window": "Batch inference helper for a PyTorch image classifier.",
|
| 22 |
+
"traceback_text": "",
|
| 23 |
+
"code": """import torch\n\nclass Predictor:\n def __init__(self, model):\n self.model = model\n\n def predict(self, batch):\n outputs = self.model(batch)\n return outputs.argmax(dim=1)\n""",
|
| 24 |
+
},
|
| 25 |
+
"FastAPI Endpoint": {
|
| 26 |
+
"domain_hint": "web",
|
| 27 |
+
"context_window": "Backend endpoint for creating review tasks from user-submitted payloads.",
|
| 28 |
+
"traceback_text": "",
|
| 29 |
+
"code": """from fastapi import FastAPI, Request\n\napp = FastAPI()\n\n@app.post('/tasks')\ndef create_task(request: Request):\n payload = request.json()\n return {'task': payload}\n""",
|
| 30 |
+
},
|
| 31 |
+
}
|
app/models/inference.py
CHANGED
|
@@ -11,25 +11,38 @@ DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
|
|
| 11 |
DEFAULT_BENCHMARK_NAME = "python_code_review_env"
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
@dataclass(slots=True)
|
| 15 |
class InferenceConfig:
|
| 16 |
"""Runtime configuration loaded from environment variables."""
|
| 17 |
|
| 18 |
api_base_url: str
|
| 19 |
model_name: str
|
| 20 |
-
|
| 21 |
-
benchmark_name: str = DEFAULT_BENCHMARK_NAME
|
| 22 |
-
request_timeout_s: float = 12.0
|
| 23 |
-
max_retries: int = 2
|
| 24 |
-
max_episode_steps: int = 12
|
| 25 |
-
success_threshold: float = 0.
|
| 26 |
|
| 27 |
@classmethod
|
| 28 |
def from_env(cls) -> "InferenceConfig":
|
|
|
|
| 29 |
return cls(
|
| 30 |
-
api_base_url=
|
| 31 |
model_name=str(os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME),
|
| 32 |
-
|
| 33 |
benchmark_name=str(os.getenv("OPENENV_BENCHMARK") or DEFAULT_BENCHMARK_NAME),
|
| 34 |
)
|
| 35 |
|
|
|
|
| 11 |
DEFAULT_BENCHMARK_NAME = "python_code_review_env"
|
| 12 |
|
| 13 |
|
| 14 |
+
def _resolve_api_key(api_base_url: str) -> str:
|
| 15 |
+
"""Choose the correct provider token for the configured endpoint."""
|
| 16 |
+
|
| 17 |
+
normalized = api_base_url.strip().lower()
|
| 18 |
+
hf_token = str(os.getenv("HF_TOKEN") or "").strip()
|
| 19 |
+
openai_api_key = str(os.getenv("OPENAI_API_KEY") or "").strip()
|
| 20 |
+
|
| 21 |
+
if "api.openai.com" in normalized:
|
| 22 |
+
return openai_api_key or hf_token
|
| 23 |
+
return hf_token or openai_api_key
|
| 24 |
+
|
| 25 |
+
|
| 26 |
@dataclass(slots=True)
|
| 27 |
class InferenceConfig:
|
| 28 |
"""Runtime configuration loaded from environment variables."""
|
| 29 |
|
| 30 |
api_base_url: str
|
| 31 |
model_name: str
|
| 32 |
+
api_key: str
|
| 33 |
+
benchmark_name: str = DEFAULT_BENCHMARK_NAME
|
| 34 |
+
request_timeout_s: float = 12.0
|
| 35 |
+
max_retries: int = 2
|
| 36 |
+
max_episode_steps: int = 12
|
| 37 |
+
success_threshold: float = 0.88
|
| 38 |
|
| 39 |
@classmethod
|
| 40 |
def from_env(cls) -> "InferenceConfig":
|
| 41 |
+
api_base_url = str(os.getenv("API_BASE_URL") or DEFAULT_API_BASE_URL)
|
| 42 |
return cls(
|
| 43 |
+
api_base_url=api_base_url,
|
| 44 |
model_name=str(os.getenv("MODEL_NAME") or DEFAULT_MODEL_NAME),
|
| 45 |
+
api_key=_resolve_api_key(api_base_url),
|
| 46 |
benchmark_name=str(os.getenv("OPENENV_BENCHMARK") or DEFAULT_BENCHMARK_NAME),
|
| 47 |
)
|
| 48 |
|
app/services/openai_service.py
CHANGED
|
@@ -20,11 +20,15 @@ class OpenAIActionPlanner:
|
|
| 20 |
|
| 21 |
def __init__(self, config: InferenceConfig) -> None:
|
| 22 |
self.config = config
|
| 23 |
-
self.client =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def propose_action(self, observation: Any) -> AgentDecision:
|
| 26 |
if self.client is None:
|
| 27 |
-
return AgentDecision(action_type="run_tests", source="fallback", error="
|
| 28 |
|
| 29 |
prompt = self._build_prompt(observation)
|
| 30 |
for attempt in range(self.config.max_retries + 1):
|
|
|
|
| 20 |
|
| 21 |
def __init__(self, config: InferenceConfig) -> None:
|
| 22 |
self.config = config
|
| 23 |
+
self.client = (
|
| 24 |
+
OpenAI(base_url=config.api_base_url, api_key=config.api_key, timeout=config.request_timeout_s)
|
| 25 |
+
if config.api_key
|
| 26 |
+
else None
|
| 27 |
+
)
|
| 28 |
|
| 29 |
def propose_action(self, observation: Any) -> AgentDecision:
|
| 30 |
if self.client is None:
|
| 31 |
+
return AgentDecision(action_type="run_tests", source="fallback", error="API key missing")
|
| 32 |
|
| 33 |
prompt = self._build_prompt(observation)
|
| 34 |
for attempt in range(self.config.max_retries + 1):
|
app/streamlit_app.py
CHANGED
|
@@ -1,44 +1,75 @@
|
|
| 1 |
-
"""Streamlit frontend for the
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import streamlit as st
|
| 6 |
|
| 7 |
from app.examples import EXAMPLES
|
| 8 |
from schemas.request import AnalyzeCodeRequest
|
| 9 |
from services.analysis_service import AnalysisService
|
| 10 |
|
| 11 |
|
| 12 |
-
analysis_service = AnalysisService()
|
| 13 |
|
| 14 |
|
| 15 |
-
def _analyze(code: str, context_window: str, traceback_text: str, domain_hint: str):
|
| 16 |
-
"""Run the analysis service with validated request payloads."""
|
| 17 |
|
| 18 |
request = AnalyzeCodeRequest(
|
| 19 |
code=code,
|
| 20 |
context_window=context_window,
|
| 21 |
traceback_text=traceback_text,
|
| 22 |
domain_hint=domain_hint, # type: ignore[arg-type]
|
| 23 |
-
)
|
| 24 |
-
return analysis_service.analyze(request)
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
traceback_text = st.text_area("Optional traceback / runtime hint", value=example["traceback_text"], height=100)
|
| 43 |
domain_hint = st.selectbox("Domain hint", ["auto", "dsa", "data_science", "ml_dl", "web"], index=["auto", "dsa", "data_science", "ml_dl", "web"].index(example["domain_hint"]))
|
| 44 |
analyze_clicked = st.button("Analyze Code", type="primary")
|
|
@@ -47,53 +78,83 @@ def main() -> None:
|
|
| 47 |
if code and (analyze_clicked or auto_analyze):
|
| 48 |
result = _analyze(code, context_window, traceback_text, domain_hint)
|
| 49 |
|
| 50 |
-
with right:
|
| 51 |
-
if result is None:
|
| 52 |
-
st.info("Paste code or load an example to start analysis.")
|
| 53 |
-
else:
|
| 54 |
-
metric_cols = st.columns(4)
|
| 55 |
-
metric_cols[0].metric("Detected domain", result.detected_domain)
|
| 56 |
-
metric_cols[1].metric("ML score", f"{result.score_breakdown.ml_score:.0%}")
|
| 57 |
-
metric_cols[2].metric("
|
| 58 |
-
metric_cols[3].metric("Reward", f"{result.score_breakdown.reward:.0%}")
|
| 59 |
-
st.
|
| 60 |
-
st.
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
st.
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
st.
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
st.subheader("
|
| 91 |
-
st.
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""Streamlit frontend for the AI-powered Python code review platform."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
|
| 7 |
from app.examples import EXAMPLES
|
| 8 |
from schemas.request import AnalyzeCodeRequest
|
| 9 |
from services.analysis_service import AnalysisService
|
| 10 |
|
| 11 |
|
| 12 |
+
analysis_service = AnalysisService()
|
| 13 |
|
| 14 |
|
| 15 |
+
def _analyze(code: str, context_window: str, traceback_text: str, domain_hint: str):
|
| 16 |
+
"""Run the analysis service with validated request payloads."""
|
| 17 |
|
| 18 |
request = AnalyzeCodeRequest(
|
| 19 |
code=code,
|
| 20 |
context_window=context_window,
|
| 21 |
traceback_text=traceback_text,
|
| 22 |
domain_hint=domain_hint, # type: ignore[arg-type]
|
| 23 |
+
)
|
| 24 |
+
return analysis_service.analyze(request)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _score_chart_data(result) -> dict[str, float]:
|
| 28 |
+
"""Prepare the most useful score signals for visual display."""
|
| 29 |
+
|
| 30 |
+
return {
|
| 31 |
+
"reward": result.score_breakdown.reward,
|
| 32 |
+
"ml_quality": result.score_breakdown.ml_score,
|
| 33 |
+
"lint": result.score_breakdown.lint_score,
|
| 34 |
+
"maintainability": result.score_breakdown.maintainability_score,
|
| 35 |
+
"readability": result.score_breakdown.readability_score,
|
| 36 |
+
"security": result.score_breakdown.security_score,
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> None:
|
| 41 |
+
"""Render the Streamlit UI."""
|
| 42 |
+
|
| 43 |
+
st.set_page_config(page_title="TorchReview Copilot", layout="wide")
|
| 44 |
+
st.title("TorchReview Copilot")
|
| 45 |
+
st.caption(
|
| 46 |
+
"AI-powered Python code review with static analysis, PyTorch scoring, "
|
| 47 |
+
"RL-ready rewards, and actionable code-improvement guidance."
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
with st.sidebar:
|
| 51 |
+
st.subheader("Review Pipeline")
|
| 52 |
+
st.markdown(
|
| 53 |
+
"\n".join(
|
| 54 |
+
[
|
| 55 |
+
"1. Input Python code",
|
| 56 |
+
"2. Parse AST + estimate complexity",
|
| 57 |
+
"3. Score with a PyTorch encoder",
|
| 58 |
+
"4. Generate suggestions and auto-fix hints",
|
| 59 |
+
"5. Compute an RL-ready reward",
|
| 60 |
+
]
|
| 61 |
+
)
|
| 62 |
+
)
|
| 63 |
+
example_name = st.selectbox("Example input", list(EXAMPLES.keys()))
|
| 64 |
+
auto_analyze = st.toggle("Real-time scoring", value=True)
|
| 65 |
+
st.info("The PyTorch layer uses CodeBERTa embeddings when weights are available, with a torch-native fallback for offline demos.")
|
| 66 |
+
|
| 67 |
+
example = EXAMPLES[example_name]
|
| 68 |
+
|
| 69 |
+
left, right = st.columns([1.2, 1.0])
|
| 70 |
+
with left:
|
| 71 |
+
code = st.text_area("Code input", value=example["code"], height=420)
|
| 72 |
+
context_window = st.text_area("Context window", value=example["context_window"], height=100)
|
| 73 |
traceback_text = st.text_area("Optional traceback / runtime hint", value=example["traceback_text"], height=100)
|
| 74 |
domain_hint = st.selectbox("Domain hint", ["auto", "dsa", "data_science", "ml_dl", "web"], index=["auto", "dsa", "data_science", "ml_dl", "web"].index(example["domain_hint"]))
|
| 75 |
analyze_clicked = st.button("Analyze Code", type="primary")
|
|
|
|
| 78 |
if code and (analyze_clicked or auto_analyze):
|
| 79 |
result = _analyze(code, context_window, traceback_text, domain_hint)
|
| 80 |
|
| 81 |
+
with right:
|
| 82 |
+
if result is None:
|
| 83 |
+
st.info("Paste code or load an example to start analysis.")
|
| 84 |
+
else:
|
| 85 |
+
metric_cols = st.columns(4)
|
| 86 |
+
metric_cols[0].metric("Detected domain", result.detected_domain)
|
| 87 |
+
metric_cols[1].metric("ML score", f"{result.score_breakdown.ml_score:.0%}")
|
| 88 |
+
metric_cols[2].metric("Lint score", f"{result.score_breakdown.lint_score:.0%}")
|
| 89 |
+
metric_cols[3].metric("Reward", f"{result.score_breakdown.reward:.0%}")
|
| 90 |
+
st.subheader("Domain Confidence")
|
| 91 |
+
st.bar_chart(result.domain_confidences)
|
| 92 |
+
st.subheader("Review Signal Radar")
|
| 93 |
+
st.bar_chart(_score_chart_data(result))
|
| 94 |
+
st.code(
|
| 95 |
+
"reward = 0.50*ml_score + 0.18*lint + 0.12*maintainability "
|
| 96 |
+
"+ 0.10*domain + 0.10*security - 0.20*complexity",
|
| 97 |
+
language="text",
|
| 98 |
+
)
|
| 99 |
+
st.caption(result.summary)
|
| 100 |
+
|
| 101 |
+
if result is not None:
|
| 102 |
+
overview_tab, suggestions_tab, domain_tab, static_tab = st.tabs(
|
| 103 |
+
["Overview", "Suggestions", "Domain Detail", "Static Analysis"]
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
with overview_tab:
|
| 107 |
+
st.subheader("Reward Breakdown")
|
| 108 |
+
st.json(result.score_visualization)
|
| 109 |
+
st.subheader("Top Signals")
|
| 110 |
+
signal_cols = st.columns(3)
|
| 111 |
+
signal_cols[0].progress(result.score_breakdown.quality_signal, text="Quality signal")
|
| 112 |
+
signal_cols[1].progress(result.score_breakdown.error_reduction_signal, text="Error reduction")
|
| 113 |
+
signal_cols[2].progress(result.score_breakdown.completion_signal, text="Completion")
|
| 114 |
+
st.subheader("Improvement Plan")
|
| 115 |
+
for step in result.improvement_plan:
|
| 116 |
+
st.write(f"- {step}")
|
| 117 |
+
if result.auto_fix_preview:
|
| 118 |
+
st.subheader("Auto-Fix Preview")
|
| 119 |
+
for hint in result.auto_fix_preview:
|
| 120 |
+
st.write(f"- {hint}")
|
| 121 |
+
st.subheader("Complexity")
|
| 122 |
+
st.write(
|
| 123 |
+
{
|
| 124 |
+
"time_complexity": result.static_analysis.time_complexity,
|
| 125 |
+
"space_complexity": result.static_analysis.space_complexity,
|
| 126 |
+
"cyclomatic_complexity": result.static_analysis.cyclomatic_complexity,
|
| 127 |
+
"max_nesting_depth": result.static_analysis.max_nesting_depth,
|
| 128 |
+
}
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
with suggestions_tab:
|
| 132 |
+
st.subheader("Suggestions")
|
| 133 |
+
for suggestion in result.suggestions:
|
| 134 |
+
st.write(f"- [{suggestion.priority}] {suggestion.title}: {suggestion.action}")
|
| 135 |
+
if result.domain_analysis.suggestions:
|
| 136 |
+
st.subheader("Domain Hints")
|
| 137 |
+
for item in result.domain_analysis.suggestions:
|
| 138 |
+
st.write(f"- {item}")
|
| 139 |
+
if result.domain_analysis.issues or result.static_analysis.issues:
|
| 140 |
+
st.subheader("Issues")
|
| 141 |
+
for issue in result.domain_analysis.issues + result.static_analysis.issues:
|
| 142 |
+
st.write(f"- [{issue.severity}] {issue.title}: {issue.description}")
|
| 143 |
+
|
| 144 |
+
with domain_tab:
|
| 145 |
+
st.subheader("Domain Highlights")
|
| 146 |
+
st.json(result.domain_analysis.highlights)
|
| 147 |
+
st.write(f"Domain score: {result.domain_analysis.domain_score:.0%}")
|
| 148 |
+
st.write(f"Model label: {result.model_prediction.quality_label}")
|
| 149 |
+
st.write(f"Model backend: `{result.model_backend}`")
|
| 150 |
+
if result.model_prediction.notes:
|
| 151 |
+
st.subheader("Model Notes")
|
| 152 |
+
for note in result.model_prediction.notes:
|
| 153 |
+
st.write(f"- {note}")
|
| 154 |
+
|
| 155 |
+
with static_tab:
|
| 156 |
+
st.subheader("Static Analysis")
|
| 157 |
+
st.json(result.static_analysis.model_dump())
|
| 158 |
|
| 159 |
|
| 160 |
if __name__ == "__main__":
|
app/utils/runtime.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Formatting, parsing, and IO-suppression helpers for inference."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -7,10 +7,14 @@ from collections.abc import Iterable
|
|
| 7 |
from contextlib import contextmanager, redirect_stderr, redirect_stdout
|
| 8 |
from typing import Any, Iterator
|
| 9 |
|
| 10 |
-
try:
|
| 11 |
-
from tasks import task_ids
|
| 12 |
-
except ImportError: # pragma: no cover
|
| 13 |
-
from python_env.tasks import task_ids # type: ignore[no-redef]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def compact_text(
|
|
@@ -51,21 +55,28 @@ def observation_attr(observation: Any, name: str, default: Any = None, *, preser
|
|
| 51 |
return value
|
| 52 |
|
| 53 |
|
| 54 |
-
def format_bool(value: Any) -> str:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
def parse_task_ids() -> list[str]:
|
|
|
|
| 1 |
+
"""Formatting, parsing, and IO-suppression helpers for inference."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 7 |
from contextlib import contextmanager, redirect_stderr, redirect_stdout
|
| 8 |
from typing import Any, Iterator
|
| 9 |
|
| 10 |
+
try:
|
| 11 |
+
from tasks import task_ids
|
| 12 |
+
except ImportError: # pragma: no cover
|
| 13 |
+
from python_env.tasks import task_ids # type: ignore[no-redef]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
MIN_DISPLAY_REWARD = 0.01
|
| 17 |
+
MAX_DISPLAY_REWARD = 0.99
|
| 18 |
|
| 19 |
|
| 20 |
def compact_text(
|
|
|
|
| 55 |
return value
|
| 56 |
|
| 57 |
|
| 58 |
+
def format_bool(value: Any) -> str:
|
| 59 |
+
"""Render booleans in the lowercase form required by OpenEnv."""
|
| 60 |
+
|
| 61 |
+
return "true" if bool(value) else "false"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def format_reward(value: Any) -> str:
|
| 65 |
+
"""Render rewards in a validator-safe two-decimal open interval."""
|
| 66 |
+
|
| 67 |
+
try:
|
| 68 |
+
reward = float(value)
|
| 69 |
+
except Exception:
|
| 70 |
+
reward = MIN_DISPLAY_REWARD
|
| 71 |
+
reward = max(MIN_DISPLAY_REWARD, min(MAX_DISPLAY_REWARD, reward))
|
| 72 |
+
return f"{reward:.2f}"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def format_error(value: Any) -> str:
|
| 76 |
+
"""Render nullable error strings in the stdout contract format."""
|
| 77 |
+
|
| 78 |
+
text = compact_text(value, default="")
|
| 79 |
+
return text if text else "null"
|
| 80 |
|
| 81 |
|
| 82 |
def parse_task_ids() -> list[str]:
|
client.py
CHANGED
|
@@ -2,16 +2,23 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from typing import Dict
|
| 6 |
-
|
| 7 |
-
from openenv.core import EnvClient
|
| 8 |
-
from openenv.core.client_types import StepResult
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class PythonCodeReviewEnv(
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import Dict
|
| 6 |
+
|
| 7 |
+
from openenv.core import EnvClient
|
| 8 |
+
from openenv.core.client_types import StepResult
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from .models import (
|
| 12 |
+
PythonCodeReviewAction,
|
| 13 |
+
PythonCodeReviewObservation,
|
| 14 |
+
PythonCodeReviewState,
|
| 15 |
+
)
|
| 16 |
+
except ImportError: # pragma: no cover
|
| 17 |
+
from models import (
|
| 18 |
+
PythonCodeReviewAction,
|
| 19 |
+
PythonCodeReviewObservation,
|
| 20 |
+
PythonCodeReviewState,
|
| 21 |
+
)
|
| 22 |
|
| 23 |
|
| 24 |
class PythonCodeReviewEnv(
|
graders/bug_fix.py
CHANGED
|
@@ -12,10 +12,10 @@ except ImportError:
|
|
| 12 |
from .shared import (
|
| 13 |
base_grade,
|
| 14 |
compile_code,
|
|
|
|
| 15 |
component_score,
|
| 16 |
execute_cases,
|
| 17 |
quality_metrics,
|
| 18 |
-
shaped_score,
|
| 19 |
similarity_score,
|
| 20 |
summarize_results,
|
| 21 |
)
|
|
@@ -32,6 +32,7 @@ def grade_bug_fix_task(
|
|
| 32 |
|
| 33 |
compiled, compile_error = compile_code(code)
|
| 34 |
quality = quality_metrics(code, task.function_name)
|
|
|
|
| 35 |
details = {
|
| 36 |
"compile_error": compile_error,
|
| 37 |
"quality_notes": quality["quality_notes"],
|
|
@@ -40,11 +41,18 @@ def grade_bug_fix_task(
|
|
| 40 |
}
|
| 41 |
|
| 42 |
if not compiled:
|
| 43 |
-
progress = 0.02 + 0.12 * similarity_score(code, task.reference_code)
|
| 44 |
details["test_results"] = []
|
| 45 |
details["test_summary"] = "Code does not compile."
|
| 46 |
return base_grade(
|
| 47 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
syntax_score=component_score(0.01),
|
| 49 |
tests_passed=0,
|
| 50 |
tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
|
|
@@ -59,9 +67,16 @@ def grade_bug_fix_task(
|
|
| 59 |
if result.get("timed_out"):
|
| 60 |
details["test_results"] = []
|
| 61 |
details["test_summary"] = result["error"]
|
| 62 |
-
progress = 0.12 + 0.18 * quality["score"]
|
| 63 |
return base_grade(
|
| 64 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
syntax_score=component_score(0.95),
|
| 66 |
tests_passed=0,
|
| 67 |
tests_total=len(cases),
|
|
@@ -73,9 +88,16 @@ def grade_bug_fix_task(
|
|
| 73 |
if "error" in result:
|
| 74 |
details["test_results"] = []
|
| 75 |
details["test_summary"] = result["error"]
|
| 76 |
-
progress = 0.1 + 0.2 * quality["score"]
|
| 77 |
return base_grade(
|
| 78 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
syntax_score=component_score(0.95),
|
| 80 |
tests_passed=0,
|
| 81 |
tests_total=len(cases),
|
|
@@ -89,9 +111,15 @@ def grade_bug_fix_task(
|
|
| 89 |
pass_rate = data["passed"] / max(data["total"], 1)
|
| 90 |
details["test_results"] = data["results"]
|
| 91 |
details["test_summary"] = summarize_results("Test results", data["results"])
|
| 92 |
-
progress = min(1.0, 0.05 + 0.8 * pass_rate + 0.15 * quality["score"])
|
| 93 |
return base_grade(
|
| 94 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
syntax_score=component_score(0.95),
|
| 96 |
tests_passed=data["passed"],
|
| 97 |
tests_total=data["total"],
|
|
|
|
| 12 |
from .shared import (
|
| 13 |
base_grade,
|
| 14 |
compile_code,
|
| 15 |
+
composite_grade_score,
|
| 16 |
component_score,
|
| 17 |
execute_cases,
|
| 18 |
quality_metrics,
|
|
|
|
| 19 |
similarity_score,
|
| 20 |
summarize_results,
|
| 21 |
)
|
|
|
|
| 32 |
|
| 33 |
compiled, compile_error = compile_code(code)
|
| 34 |
quality = quality_metrics(code, task.function_name)
|
| 35 |
+
similarity = similarity_score(code, task.reference_code)
|
| 36 |
details = {
|
| 37 |
"compile_error": compile_error,
|
| 38 |
"quality_notes": quality["quality_notes"],
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
if not compiled:
|
|
|
|
| 44 |
details["test_results"] = []
|
| 45 |
details["test_summary"] = "Code does not compile."
|
| 46 |
return base_grade(
|
| 47 |
+
score=composite_grade_score(
|
| 48 |
+
correctness=0.0,
|
| 49 |
+
quality=0.05,
|
| 50 |
+
runtime=0.05,
|
| 51 |
+
syntax=0.0,
|
| 52 |
+
similarity=similarity,
|
| 53 |
+
baseline=0.04,
|
| 54 |
+
penalty=0.05,
|
| 55 |
+
),
|
| 56 |
syntax_score=component_score(0.01),
|
| 57 |
tests_passed=0,
|
| 58 |
tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
|
|
|
|
| 67 |
if result.get("timed_out"):
|
| 68 |
details["test_results"] = []
|
| 69 |
details["test_summary"] = result["error"]
|
|
|
|
| 70 |
return base_grade(
|
| 71 |
+
score=composite_grade_score(
|
| 72 |
+
correctness=0.10,
|
| 73 |
+
quality=quality["score"],
|
| 74 |
+
runtime=0.0,
|
| 75 |
+
syntax=0.95,
|
| 76 |
+
similarity=similarity,
|
| 77 |
+
baseline=0.06,
|
| 78 |
+
penalty=0.12,
|
| 79 |
+
),
|
| 80 |
syntax_score=component_score(0.95),
|
| 81 |
tests_passed=0,
|
| 82 |
tests_total=len(cases),
|
|
|
|
| 88 |
if "error" in result:
|
| 89 |
details["test_results"] = []
|
| 90 |
details["test_summary"] = result["error"]
|
|
|
|
| 91 |
return base_grade(
|
| 92 |
+
score=composite_grade_score(
|
| 93 |
+
correctness=0.12,
|
| 94 |
+
quality=quality["score"],
|
| 95 |
+
runtime=0.0,
|
| 96 |
+
syntax=0.95,
|
| 97 |
+
similarity=similarity,
|
| 98 |
+
baseline=0.06,
|
| 99 |
+
penalty=0.08,
|
| 100 |
+
),
|
| 101 |
syntax_score=component_score(0.95),
|
| 102 |
tests_passed=0,
|
| 103 |
tests_total=len(cases),
|
|
|
|
| 111 |
pass_rate = data["passed"] / max(data["total"], 1)
|
| 112 |
details["test_results"] = data["results"]
|
| 113 |
details["test_summary"] = summarize_results("Test results", data["results"])
|
|
|
|
| 114 |
return base_grade(
|
| 115 |
+
score=composite_grade_score(
|
| 116 |
+
correctness=pass_rate,
|
| 117 |
+
quality=quality["score"],
|
| 118 |
+
runtime=0.05,
|
| 119 |
+
syntax=0.95,
|
| 120 |
+
similarity=similarity,
|
| 121 |
+
baseline=0.08,
|
| 122 |
+
),
|
| 123 |
syntax_score=component_score(0.95),
|
| 124 |
tests_passed=data["passed"],
|
| 125 |
tests_total=data["total"],
|
graders/optimization.py
CHANGED
|
@@ -13,10 +13,10 @@ from .shared import (
|
|
| 13 |
base_grade,
|
| 14 |
benchmark_candidate,
|
| 15 |
compile_code,
|
|
|
|
| 16 |
component_score,
|
| 17 |
execute_cases,
|
| 18 |
quality_metrics,
|
| 19 |
-
shaped_score,
|
| 20 |
similarity_score,
|
| 21 |
summarize_results,
|
| 22 |
)
|
|
@@ -33,6 +33,7 @@ def grade_optimization_task(
|
|
| 33 |
|
| 34 |
compiled, compile_error = compile_code(code)
|
| 35 |
quality = quality_metrics(code, task.function_name)
|
|
|
|
| 36 |
details = {
|
| 37 |
"compile_error": compile_error,
|
| 38 |
"quality_notes": quality["quality_notes"],
|
|
@@ -41,11 +42,18 @@ def grade_optimization_task(
|
|
| 41 |
}
|
| 42 |
|
| 43 |
if not compiled:
|
| 44 |
-
progress = 0.02 + 0.1 * similarity_score(code, task.reference_code)
|
| 45 |
details["test_results"] = []
|
| 46 |
details["test_summary"] = "Code does not compile."
|
| 47 |
return base_grade(
|
| 48 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
syntax_score=component_score(0.01),
|
| 50 |
tests_passed=0,
|
| 51 |
tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
|
|
@@ -60,9 +68,16 @@ def grade_optimization_task(
|
|
| 60 |
if result.get("timed_out"):
|
| 61 |
details["test_results"] = []
|
| 62 |
details["test_summary"] = result["error"]
|
| 63 |
-
progress = 0.1 + 0.18 * quality["score"]
|
| 64 |
return base_grade(
|
| 65 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
syntax_score=component_score(0.95),
|
| 67 |
tests_passed=0,
|
| 68 |
tests_total=len(cases),
|
|
@@ -74,9 +89,16 @@ def grade_optimization_task(
|
|
| 74 |
if "error" in result:
|
| 75 |
details["test_results"] = []
|
| 76 |
details["test_summary"] = result["error"]
|
| 77 |
-
progress = 0.1 + 0.2 * quality["score"]
|
| 78 |
return base_grade(
|
| 79 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
syntax_score=component_score(0.95),
|
| 81 |
tests_passed=0,
|
| 82 |
tests_total=len(cases),
|
|
@@ -105,13 +127,16 @@ def grade_optimization_task(
|
|
| 105 |
details["benchmark"] = benchmark_summary
|
| 106 |
|
| 107 |
runtime_progress = 0.0 if benchmark_summary == "Benchmark deferred until hidden evaluation." else runtime_score
|
| 108 |
-
if include_hidden:
|
| 109 |
-
progress = min(1.0, 0.05 + 0.6 * pass_rate + 0.2 * quality["score"] + 0.15 * runtime_progress)
|
| 110 |
-
else:
|
| 111 |
-
progress = min(1.0, 0.05 + 0.7 * pass_rate + 0.25 * quality["score"])
|
| 112 |
-
|
| 113 |
return base_grade(
|
| 114 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
syntax_score=component_score(0.95),
|
| 116 |
tests_passed=data["passed"],
|
| 117 |
tests_total=data["total"],
|
|
|
|
| 13 |
base_grade,
|
| 14 |
benchmark_candidate,
|
| 15 |
compile_code,
|
| 16 |
+
composite_grade_score,
|
| 17 |
component_score,
|
| 18 |
execute_cases,
|
| 19 |
quality_metrics,
|
|
|
|
| 20 |
similarity_score,
|
| 21 |
summarize_results,
|
| 22 |
)
|
|
|
|
| 33 |
|
| 34 |
compiled, compile_error = compile_code(code)
|
| 35 |
quality = quality_metrics(code, task.function_name)
|
| 36 |
+
similarity = similarity_score(code, task.reference_code)
|
| 37 |
details = {
|
| 38 |
"compile_error": compile_error,
|
| 39 |
"quality_notes": quality["quality_notes"],
|
|
|
|
| 42 |
}
|
| 43 |
|
| 44 |
if not compiled:
|
|
|
|
| 45 |
details["test_results"] = []
|
| 46 |
details["test_summary"] = "Code does not compile."
|
| 47 |
return base_grade(
|
| 48 |
+
score=composite_grade_score(
|
| 49 |
+
correctness=0.0,
|
| 50 |
+
quality=0.05,
|
| 51 |
+
runtime=0.0,
|
| 52 |
+
syntax=0.0,
|
| 53 |
+
similarity=similarity,
|
| 54 |
+
baseline=0.04,
|
| 55 |
+
penalty=0.06,
|
| 56 |
+
),
|
| 57 |
syntax_score=component_score(0.01),
|
| 58 |
tests_passed=0,
|
| 59 |
tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
|
|
|
|
| 68 |
if result.get("timed_out"):
|
| 69 |
details["test_results"] = []
|
| 70 |
details["test_summary"] = result["error"]
|
|
|
|
| 71 |
return base_grade(
|
| 72 |
+
score=composite_grade_score(
|
| 73 |
+
correctness=0.08,
|
| 74 |
+
quality=quality["score"],
|
| 75 |
+
runtime=0.0,
|
| 76 |
+
syntax=0.95,
|
| 77 |
+
similarity=similarity,
|
| 78 |
+
baseline=0.05,
|
| 79 |
+
penalty=0.14,
|
| 80 |
+
),
|
| 81 |
syntax_score=component_score(0.95),
|
| 82 |
tests_passed=0,
|
| 83 |
tests_total=len(cases),
|
|
|
|
| 89 |
if "error" in result:
|
| 90 |
details["test_results"] = []
|
| 91 |
details["test_summary"] = result["error"]
|
|
|
|
| 92 |
return base_grade(
|
| 93 |
+
score=composite_grade_score(
|
| 94 |
+
correctness=0.10,
|
| 95 |
+
quality=quality["score"],
|
| 96 |
+
runtime=0.0,
|
| 97 |
+
syntax=0.95,
|
| 98 |
+
similarity=similarity,
|
| 99 |
+
baseline=0.05,
|
| 100 |
+
penalty=0.08,
|
| 101 |
+
),
|
| 102 |
syntax_score=component_score(0.95),
|
| 103 |
tests_passed=0,
|
| 104 |
tests_total=len(cases),
|
|
|
|
| 127 |
details["benchmark"] = benchmark_summary
|
| 128 |
|
| 129 |
runtime_progress = 0.0 if benchmark_summary == "Benchmark deferred until hidden evaluation." else runtime_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
return base_grade(
|
| 131 |
+
score=composite_grade_score(
|
| 132 |
+
correctness=pass_rate,
|
| 133 |
+
quality=quality["score"],
|
| 134 |
+
runtime=runtime_progress if include_hidden else 0.10,
|
| 135 |
+
syntax=0.95,
|
| 136 |
+
similarity=similarity,
|
| 137 |
+
baseline=0.08 if include_hidden else 0.07,
|
| 138 |
+
penalty=0.10 if timed_out else 0.0,
|
| 139 |
+
),
|
| 140 |
syntax_score=component_score(0.95),
|
| 141 |
tests_passed=data["passed"],
|
| 142 |
tests_total=data["total"],
|
graders/shared.py
CHANGED
|
@@ -23,6 +23,7 @@ STRICT_SCORE_MIN = 0.01
|
|
| 23 |
STRICT_SCORE_MAX = 0.99
|
| 24 |
POOR_SCORE = 0.1
|
| 25 |
NEAR_PERFECT_SCORE = 0.95
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def finite_float(value: Any, fallback: float = STRICT_SCORE_MIN) -> float:
|
|
@@ -44,22 +45,45 @@ def clamp(value: float, lower: float = 0.0, upper: float = 1.0) -> float:
|
|
| 44 |
return max(lower, min(upper, numeric))
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def strict_score(value: Any, lower: float = STRICT_SCORE_MIN, upper: float = STRICT_SCORE_MAX) -> float:
|
| 48 |
"""Clamp a score to the OpenEnv-safe open interval (0, 1)."""
|
| 49 |
|
| 50 |
score = max(lower, min(upper, finite_float(value, fallback=lower)))
|
| 51 |
-
score =
|
| 52 |
assert 0 < score < 1, f"Invalid score: {score}"
|
| 53 |
return score
|
| 54 |
|
| 55 |
|
| 56 |
def shaped_score(progress: Any, floor: float = POOR_SCORE, ceiling: float = NEAR_PERFECT_SCORE) -> float:
|
| 57 |
-
"""Map progress in [0, 1] to a
|
| 58 |
|
| 59 |
bounded_progress = clamp(finite_float(progress, fallback=0.0))
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
score =
|
|
|
|
| 63 |
assert 0 < score < 1, f"Invalid score: {score}"
|
| 64 |
return score
|
| 65 |
|
|
@@ -83,7 +107,56 @@ def safe_ratio(numerator: Any, denominator: Any) -> float:
|
|
| 83 |
def component_score(value: Any) -> float:
|
| 84 |
"""Normalize component scores such as syntax, quality, and runtime."""
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
def compile_code(code: str) -> tuple[bool, str]:
|
|
@@ -121,23 +194,31 @@ def _queue_worker(
|
|
| 121 |
)
|
| 122 |
|
| 123 |
|
| 124 |
-
def run_with_timeout(
|
| 125 |
-
worker: Callable[[Dict[str, Any]], Dict[str, Any]],
|
| 126 |
-
payload: Dict[str, Any],
|
| 127 |
-
timeout_s: float,
|
| 128 |
-
) -> Dict[str, Any]:
|
| 129 |
-
"""Execute a worker in a subprocess and terminate on timeout.
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
process.
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
if queue.empty():
|
| 143 |
return {"timed_out": False, "error": "Worker exited without returning a result."}
|
|
|
|
| 23 |
STRICT_SCORE_MAX = 0.99
|
| 24 |
POOR_SCORE = 0.1
|
| 25 |
NEAR_PERFECT_SCORE = 0.95
|
| 26 |
+
EPS = 1e-6
|
| 27 |
|
| 28 |
|
| 29 |
def finite_float(value: Any, fallback: float = STRICT_SCORE_MIN) -> float:
|
|
|
|
| 45 |
return max(lower, min(upper, numeric))
|
| 46 |
|
| 47 |
|
| 48 |
+
def safe_score(score: Any) -> float:
|
| 49 |
+
"""Clamp any score to the strict OpenEnv-safe open interval (0, 1)."""
|
| 50 |
+
|
| 51 |
+
bounded = max(EPS, min(1.0 - EPS, finite_float(score, fallback=EPS)))
|
| 52 |
+
assert 0 < bounded < 1, f"Score must be strictly between 0 and 1: {bounded}"
|
| 53 |
+
return bounded
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def normalize_score(x: Any) -> float:
|
| 57 |
+
"""Sigmoid-normalize a raw score and clamp it safely into (0, 1)."""
|
| 58 |
+
|
| 59 |
+
numeric = finite_float(x, fallback=0.0)
|
| 60 |
+
bounded = max(-20.0, min(20.0, numeric))
|
| 61 |
+
return safe_score(1.0 / (1.0 + math.exp(-bounded)))
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def final_score_pipeline(raw_score: Any) -> float:
|
| 65 |
+
"""Normalize arbitrary raw scoring signals into a strict OpenEnv-safe score."""
|
| 66 |
+
|
| 67 |
+
return normalize_score(raw_score)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
def strict_score(value: Any, lower: float = STRICT_SCORE_MIN, upper: float = STRICT_SCORE_MAX) -> float:
|
| 71 |
"""Clamp a score to the OpenEnv-safe open interval (0, 1)."""
|
| 72 |
|
| 73 |
score = max(lower, min(upper, finite_float(value, fallback=lower)))
|
| 74 |
+
score = safe_score(score)
|
| 75 |
assert 0 < score < 1, f"Invalid score: {score}"
|
| 76 |
return score
|
| 77 |
|
| 78 |
|
| 79 |
def shaped_score(progress: Any, floor: float = POOR_SCORE, ceiling: float = NEAR_PERFECT_SCORE) -> float:
|
| 80 |
+
"""Map progress in [0, 1] to a smooth score band within (0, 1)."""
|
| 81 |
|
| 82 |
bounded_progress = clamp(finite_float(progress, fallback=0.0))
|
| 83 |
+
centered_progress = (bounded_progress - 0.5) * 6.0
|
| 84 |
+
smoothed_progress = final_score_pipeline(centered_progress)
|
| 85 |
+
score = floor + (ceiling - floor) * smoothed_progress
|
| 86 |
+
score = safe_score(score)
|
| 87 |
assert 0 < score < 1, f"Invalid score: {score}"
|
| 88 |
return score
|
| 89 |
|
|
|
|
| 107 |
def component_score(value: Any) -> float:
|
| 108 |
"""Normalize component scores such as syntax, quality, and runtime."""
|
| 109 |
|
| 110 |
+
bounded_value = clamp(finite_float(value, fallback=0.0))
|
| 111 |
+
return shaped_score(bounded_value, floor=0.02, ceiling=0.98)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def composite_progress(
|
| 115 |
+
*,
|
| 116 |
+
correctness: Any = 0.0,
|
| 117 |
+
quality: Any = 0.0,
|
| 118 |
+
runtime: Any = 0.0,
|
| 119 |
+
syntax: Any = 0.0,
|
| 120 |
+
similarity: Any = 0.0,
|
| 121 |
+
baseline: float = 0.05,
|
| 122 |
+
penalty: Any = 0.0,
|
| 123 |
+
) -> float:
|
| 124 |
+
"""Blend multiple progress signals into a stable scalar progress estimate."""
|
| 125 |
+
|
| 126 |
+
progress = (
|
| 127 |
+
finite_float(baseline, fallback=0.05)
|
| 128 |
+
+ 0.45 * clamp(correctness)
|
| 129 |
+
+ 0.20 * clamp(quality)
|
| 130 |
+
+ 0.15 * clamp(runtime)
|
| 131 |
+
+ 0.15 * clamp(syntax)
|
| 132 |
+
+ 0.05 * clamp(similarity)
|
| 133 |
+
- 0.20 * clamp(penalty)
|
| 134 |
+
)
|
| 135 |
+
return clamp(progress)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def composite_grade_score(
|
| 139 |
+
*,
|
| 140 |
+
correctness: Any = 0.0,
|
| 141 |
+
quality: Any = 0.0,
|
| 142 |
+
runtime: Any = 0.0,
|
| 143 |
+
syntax: Any = 0.0,
|
| 144 |
+
similarity: Any = 0.0,
|
| 145 |
+
baseline: float = 0.05,
|
| 146 |
+
penalty: Any = 0.0,
|
| 147 |
+
) -> float:
|
| 148 |
+
"""Create a smooth task score from multiple bounded signals."""
|
| 149 |
+
|
| 150 |
+
progress = composite_progress(
|
| 151 |
+
correctness=correctness,
|
| 152 |
+
quality=quality,
|
| 153 |
+
runtime=runtime,
|
| 154 |
+
syntax=syntax,
|
| 155 |
+
similarity=similarity,
|
| 156 |
+
baseline=baseline,
|
| 157 |
+
penalty=penalty,
|
| 158 |
+
)
|
| 159 |
+
return shaped_score(progress)
|
| 160 |
|
| 161 |
|
| 162 |
def compile_code(code: str) -> tuple[bool, str]:
|
|
|
|
| 194 |
)
|
| 195 |
|
| 196 |
|
| 197 |
+
def run_with_timeout(
|
| 198 |
+
worker: Callable[[Dict[str, Any]], Dict[str, Any]],
|
| 199 |
+
payload: Dict[str, Any],
|
| 200 |
+
timeout_s: float,
|
| 201 |
+
) -> Dict[str, Any]:
|
| 202 |
+
"""Execute a worker in a subprocess and terminate on timeout.
|
| 203 |
+
|
| 204 |
+
Some constrained Windows environments disallow spawned pipes or child
|
| 205 |
+
processes. In those cases, fall back to the inline timeout path so local
|
| 206 |
+
demos and tests still work deterministically.
|
| 207 |
+
"""
|
| 208 |
+
|
| 209 |
+
try:
|
| 210 |
+
ctx = mp.get_context("spawn")
|
| 211 |
+
queue = ctx.Queue()
|
| 212 |
+
process = ctx.Process(target=_queue_worker, args=(worker, payload, queue))
|
| 213 |
+
process.start()
|
| 214 |
+
process.join(timeout_s)
|
| 215 |
+
except (PermissionError, OSError):
|
| 216 |
+
return run_inline_with_timeout(worker, payload, timeout_s)
|
| 217 |
+
|
| 218 |
+
if process.is_alive():
|
| 219 |
+
process.terminate()
|
| 220 |
+
process.join()
|
| 221 |
+
return {"timed_out": True, "error": f"Execution exceeded {timeout_s:.1f}s timeout."}
|
| 222 |
|
| 223 |
if queue.empty():
|
| 224 |
return {"timed_out": False, "error": "Worker exited without returning a result."}
|
graders/syntax.py
CHANGED
|
@@ -12,10 +12,10 @@ except ImportError:
|
|
| 12 |
from .shared import (
|
| 13 |
base_grade,
|
| 14 |
compile_code,
|
|
|
|
| 15 |
component_score,
|
| 16 |
execute_cases,
|
| 17 |
quality_metrics,
|
| 18 |
-
shaped_score,
|
| 19 |
similarity_score,
|
| 20 |
summarize_results,
|
| 21 |
)
|
|
@@ -26,6 +26,7 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> Ta
|
|
| 26 |
|
| 27 |
compiled, compile_error = compile_code(code)
|
| 28 |
quality = quality_metrics(code, task.function_name)
|
|
|
|
| 29 |
details = {
|
| 30 |
"compile_error": compile_error,
|
| 31 |
"quality_notes": quality["quality_notes"],
|
|
@@ -33,11 +34,18 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> Ta
|
|
| 33 |
}
|
| 34 |
|
| 35 |
if not compiled:
|
| 36 |
-
progress = 0.05 + 0.2 * similarity_score(code, task.reference_code)
|
| 37 |
details["test_results"] = []
|
| 38 |
details["test_summary"] = "Code does not compile yet."
|
| 39 |
return base_grade(
|
| 40 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
syntax_score=component_score(0.01),
|
| 42 |
tests_passed=0,
|
| 43 |
tests_total=len(task.public_cases) + len(task.hidden_cases),
|
|
@@ -52,9 +60,16 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> Ta
|
|
| 52 |
if result.get("timed_out"):
|
| 53 |
details["test_results"] = []
|
| 54 |
details["test_summary"] = result["error"]
|
| 55 |
-
progress = 0.2 + 0.25 * quality["score"]
|
| 56 |
return base_grade(
|
| 57 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
syntax_score=component_score(0.95),
|
| 59 |
tests_passed=0,
|
| 60 |
tests_total=len(cases),
|
|
@@ -66,9 +81,16 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> Ta
|
|
| 66 |
if "error" in result:
|
| 67 |
details["test_results"] = []
|
| 68 |
details["test_summary"] = result["error"]
|
| 69 |
-
progress = 0.18 + 0.2 * quality["score"]
|
| 70 |
return base_grade(
|
| 71 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
syntax_score=component_score(0.95),
|
| 73 |
tests_passed=0,
|
| 74 |
tests_total=len(cases),
|
|
@@ -82,9 +104,15 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> Ta
|
|
| 82 |
details["test_results"] = data["results"]
|
| 83 |
details["test_summary"] = summarize_results("Validation checks", data["results"])
|
| 84 |
pass_rate = data["passed"] / max(data["total"], 1)
|
| 85 |
-
progress = min(1.0, 0.15 + 0.75 * pass_rate + 0.1 * quality["score"])
|
| 86 |
return base_grade(
|
| 87 |
-
score=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
syntax_score=component_score(0.95),
|
| 89 |
tests_passed=data["passed"],
|
| 90 |
tests_total=data["total"],
|
|
|
|
| 12 |
from .shared import (
|
| 13 |
base_grade,
|
| 14 |
compile_code,
|
| 15 |
+
composite_grade_score,
|
| 16 |
component_score,
|
| 17 |
execute_cases,
|
| 18 |
quality_metrics,
|
|
|
|
| 19 |
similarity_score,
|
| 20 |
summarize_results,
|
| 21 |
)
|
|
|
|
| 26 |
|
| 27 |
compiled, compile_error = compile_code(code)
|
| 28 |
quality = quality_metrics(code, task.function_name)
|
| 29 |
+
similarity = similarity_score(code, task.reference_code)
|
| 30 |
details = {
|
| 31 |
"compile_error": compile_error,
|
| 32 |
"quality_notes": quality["quality_notes"],
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
if not compiled:
|
|
|
|
| 37 |
details["test_results"] = []
|
| 38 |
details["test_summary"] = "Code does not compile yet."
|
| 39 |
return base_grade(
|
| 40 |
+
score=composite_grade_score(
|
| 41 |
+
correctness=0.0,
|
| 42 |
+
quality=0.05,
|
| 43 |
+
runtime=0.05,
|
| 44 |
+
syntax=0.0,
|
| 45 |
+
similarity=similarity,
|
| 46 |
+
baseline=0.05,
|
| 47 |
+
penalty=0.05,
|
| 48 |
+
),
|
| 49 |
syntax_score=component_score(0.01),
|
| 50 |
tests_passed=0,
|
| 51 |
tests_total=len(task.public_cases) + len(task.hidden_cases),
|
|
|
|
| 60 |
if result.get("timed_out"):
|
| 61 |
details["test_results"] = []
|
| 62 |
details["test_summary"] = result["error"]
|
|
|
|
| 63 |
return base_grade(
|
| 64 |
+
score=composite_grade_score(
|
| 65 |
+
correctness=0.15,
|
| 66 |
+
quality=quality["score"],
|
| 67 |
+
runtime=0.0,
|
| 68 |
+
syntax=0.95,
|
| 69 |
+
similarity=similarity,
|
| 70 |
+
baseline=0.08,
|
| 71 |
+
penalty=0.12,
|
| 72 |
+
),
|
| 73 |
syntax_score=component_score(0.95),
|
| 74 |
tests_passed=0,
|
| 75 |
tests_total=len(cases),
|
|
|
|
| 81 |
if "error" in result:
|
| 82 |
details["test_results"] = []
|
| 83 |
details["test_summary"] = result["error"]
|
|
|
|
| 84 |
return base_grade(
|
| 85 |
+
score=composite_grade_score(
|
| 86 |
+
correctness=0.18,
|
| 87 |
+
quality=quality["score"],
|
| 88 |
+
runtime=0.0,
|
| 89 |
+
syntax=0.95,
|
| 90 |
+
similarity=similarity,
|
| 91 |
+
baseline=0.08,
|
| 92 |
+
penalty=0.08,
|
| 93 |
+
),
|
| 94 |
syntax_score=component_score(0.95),
|
| 95 |
tests_passed=0,
|
| 96 |
tests_total=len(cases),
|
|
|
|
| 104 |
details["test_results"] = data["results"]
|
| 105 |
details["test_summary"] = summarize_results("Validation checks", data["results"])
|
| 106 |
pass_rate = data["passed"] / max(data["total"], 1)
|
|
|
|
| 107 |
return base_grade(
|
| 108 |
+
score=composite_grade_score(
|
| 109 |
+
correctness=pass_rate,
|
| 110 |
+
quality=quality["score"],
|
| 111 |
+
runtime=0.05,
|
| 112 |
+
syntax=0.95,
|
| 113 |
+
similarity=similarity,
|
| 114 |
+
baseline=0.10,
|
| 115 |
+
),
|
| 116 |
syntax_score=component_score(0.95),
|
| 117 |
tests_passed=data["passed"],
|
| 118 |
tests_total=data["total"],
|
models/pytorch_model.py
CHANGED
|
@@ -1,149 +1,227 @@
|
|
| 1 |
-
"""PyTorch + transformers model wrapper for
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import hashlib
|
| 6 |
-
from typing import Dict, List, Sequence
|
| 7 |
-
|
| 8 |
-
import torch
|
| 9 |
-
import torch.nn.functional as F
|
| 10 |
-
|
| 11 |
-
try:
|
| 12 |
-
from transformers import AutoModel, AutoTokenizer
|
| 13 |
-
except Exception:
|
| 14 |
-
AutoModel = None # type: ignore[assignment]
|
| 15 |
-
AutoTokenizer = None # type: ignore[assignment]
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
DOMAIN_PROTOTYPES: Dict[str, List[str]] = {
|
| 19 |
-
"dsa": [
|
| 20 |
-
"
|
| 21 |
-
"Competitive programming
|
| 22 |
-
],
|
| 23 |
-
"data_science": [
|
| 24 |
-
"Pandas dataframe transformation, numpy vectorization, feature
|
| 25 |
-
"
|
| 26 |
-
],
|
| 27 |
-
"ml_dl": [
|
| 28 |
-
"PyTorch model
|
| 29 |
-
"Machine learning
|
| 30 |
-
],
|
| 31 |
-
"web": [
|
| 32 |
-
"FastAPI endpoint
|
| 33 |
-
"
|
| 34 |
-
],
|
| 35 |
-
"general": [
|
| 36 |
-
"General Python utility code with
|
| 37 |
-
],
|
| 38 |
-
}
|
| 39 |
-
|
| 40 |
-
QUALITY_ANCHORS: Dict[str, List[str]] = {
|
| 41 |
-
"high": [
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
],
|
| 45 |
-
"low": [
|
| 46 |
-
"
|
| 47 |
-
"
|
| 48 |
-
],
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
self.
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch + transformers model wrapper for code-quality scoring."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
from typing import Dict, List, Sequence
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
from transformers import AutoModel, AutoTokenizer
|
| 13 |
+
except Exception:
|
| 14 |
+
AutoModel = None # type: ignore[assignment]
|
| 15 |
+
AutoTokenizer = None # type: ignore[assignment]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
DOMAIN_PROTOTYPES: Dict[str, List[str]] = {
|
| 19 |
+
"dsa": [
|
| 20 |
+
"Algorithmic Python with nested loops, recursion, dynamic programming, maps, and asymptotic analysis.",
|
| 21 |
+
"Competitive programming utility focused on arrays, graphs, search, and runtime complexity.",
|
| 22 |
+
],
|
| 23 |
+
"data_science": [
|
| 24 |
+
"Pandas dataframe transformation, numpy vectorization, feature engineering, data cleaning, and leakage prevention.",
|
| 25 |
+
"Notebook-style data pipeline using joins, aggregations, and columnar operations.",
|
| 26 |
+
],
|
| 27 |
+
"ml_dl": [
|
| 28 |
+
"PyTorch model inference or training loop with eval mode, no_grad, tensors, optimizer, and loss functions.",
|
| 29 |
+
"Machine learning code with torch, sklearn, batches, checkpoints, and metrics.",
|
| 30 |
+
],
|
| 31 |
+
"web": [
|
| 32 |
+
"FastAPI backend endpoint with pydantic validation, dependency injection, request parsing, and API safety.",
|
| 33 |
+
"Python web-service route handling, serialization, authentication, and response contracts.",
|
| 34 |
+
],
|
| 35 |
+
"general": [
|
| 36 |
+
"General Python utility code with readability, typing, small functions, tests, and maintainable abstractions.",
|
| 37 |
+
],
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
QUALITY_ANCHORS: Dict[str, List[str]] = {
|
| 41 |
+
"high": [
|
| 42 |
+
"Production-ready Python code with clear naming, docstrings, validation, efficient loops, and low complexity.",
|
| 43 |
+
"Clean code with explicit error handling, typing, modular design, and testable functions.",
|
| 44 |
+
],
|
| 45 |
+
"low": [
|
| 46 |
+
"Bug-prone Python with nested loops, missing validation, weak naming, duplicated logic, and hard-to-review structure.",
|
| 47 |
+
"Risky code with syntax drift, unclear behavior, mutable side effects, and repeated scans over data.",
|
| 48 |
+
],
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
MAINTAINABILITY_ANCHORS: Dict[str, List[str]] = {
|
| 52 |
+
"high": [
|
| 53 |
+
"Readable functions, small logical units, strong typing, comments only where needed, and simple control flow.",
|
| 54 |
+
"Maintainable Python service with clean architecture, cohesive modules, and explicit contracts.",
|
| 55 |
+
],
|
| 56 |
+
"low": [
|
| 57 |
+
"Large unstructured function, missing docstrings, weak names, deeply nested branches, and difficult debugging.",
|
| 58 |
+
"Hard-to-maintain script with inconsistent style, brittle branching, and hidden side effects.",
|
| 59 |
+
],
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
ISSUE_ANCHORS: Dict[str, List[str]] = {
|
| 63 |
+
"correctness": [
|
| 64 |
+
"Off-by-one bug, missing final append, incorrect boundary handling, failing assertions, wrong return value.",
|
| 65 |
+
"Logic regression caused by a missing branch, incorrect state update, or unhandled edge case.",
|
| 66 |
+
],
|
| 67 |
+
"performance": [
|
| 68 |
+
"Repeated full-list scans, brute-force nested loops, iterrows misuse, avoidable O(n^2) behavior, slow pipeline.",
|
| 69 |
+
"Performance regression from redundant iteration, poor data structures, or missing vectorization.",
|
| 70 |
+
],
|
| 71 |
+
"security": [
|
| 72 |
+
"Unsafe input handling, unchecked request payload, eval usage, missing validation, insecure backend pattern.",
|
| 73 |
+
"Security risk caused by trusting raw user input or bypassing schema validation.",
|
| 74 |
+
],
|
| 75 |
+
"style": [
|
| 76 |
+
"Readability issues from long lines, missing docstrings, inconsistent spacing, tabs, and trailing whitespace.",
|
| 77 |
+
"Style drift that makes code review harder and maintenance slower.",
|
| 78 |
+
],
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class _HashEmbeddingBackend:
|
| 83 |
+
"""Torch-native fallback when pretrained weights cannot be loaded."""
|
| 84 |
+
|
| 85 |
+
def __init__(self, dimensions: int = 128) -> None:
|
| 86 |
+
self.dimensions = dimensions
|
| 87 |
+
self.model_id = "hashed-token-fallback"
|
| 88 |
+
self.backend_name = "hashed-token-fallback"
|
| 89 |
+
self.notes = ["Using hashed embeddings because pretrained transformer weights are unavailable."]
|
| 90 |
+
|
| 91 |
+
def embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
|
| 92 |
+
matrix = torch.zeros((len(texts), self.dimensions), dtype=torch.float32)
|
| 93 |
+
for row_index, text in enumerate(texts):
|
| 94 |
+
tokens = text.lower().split()[:512]
|
| 95 |
+
if not tokens:
|
| 96 |
+
matrix[row_index, 0] = 1.0
|
| 97 |
+
continue
|
| 98 |
+
for token in tokens:
|
| 99 |
+
digest = hashlib.md5(token.encode("utf-8")).hexdigest()
|
| 100 |
+
bucket = int(digest[:8], 16) % self.dimensions
|
| 101 |
+
sign = -1.0 if int(digest[8:10], 16) % 2 else 1.0
|
| 102 |
+
matrix[row_index, bucket] += sign
|
| 103 |
+
return F.normalize(matrix + 1e-6, dim=1)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class PyTorchCodeAnalyzerModel:
|
| 107 |
+
"""Score code using pretrained transformer embeddings plus prototype similarity."""
|
| 108 |
+
|
| 109 |
+
def __init__(self, model_id: str = "huggingface/CodeBERTa-small-v1") -> None:
|
| 110 |
+
self.model_id = model_id
|
| 111 |
+
self.backend_name = model_id
|
| 112 |
+
self.notes: List[str] = []
|
| 113 |
+
self._tokenizer = None
|
| 114 |
+
self._model = None
|
| 115 |
+
self._fallback = _HashEmbeddingBackend()
|
| 116 |
+
self._prototype_cache: Dict[str, torch.Tensor] = {}
|
| 117 |
+
|
| 118 |
+
def _ensure_loaded(self) -> None:
|
| 119 |
+
if self._model is not None or self.notes:
|
| 120 |
+
return
|
| 121 |
+
if AutoTokenizer is None or AutoModel is None:
|
| 122 |
+
self.backend_name = self._fallback.backend_name
|
| 123 |
+
self.notes = list(self._fallback.notes)
|
| 124 |
+
return
|
| 125 |
+
try:
|
| 126 |
+
self._tokenizer = AutoTokenizer.from_pretrained(self.model_id)
|
| 127 |
+
self._model = AutoModel.from_pretrained(self.model_id)
|
| 128 |
+
self._model.eval()
|
| 129 |
+
self.notes.append(f"Loaded pretrained encoder `{self.model_id}`.")
|
| 130 |
+
except Exception as exc:
|
| 131 |
+
self.backend_name = self._fallback.backend_name
|
| 132 |
+
self.notes = list(self._fallback.notes) + [f"Pretrained load failed: {type(exc).__name__}: {exc}"]
|
| 133 |
+
|
| 134 |
+
def _embed_texts(self, texts: Sequence[str]) -> torch.Tensor:
|
| 135 |
+
self._ensure_loaded()
|
| 136 |
+
if self._model is None or self._tokenizer is None:
|
| 137 |
+
return self._fallback.embed_texts(texts)
|
| 138 |
+
encoded = self._tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")
|
| 139 |
+
with torch.no_grad():
|
| 140 |
+
outputs = self._model(**encoded)
|
| 141 |
+
hidden = outputs.last_hidden_state
|
| 142 |
+
mask = encoded["attention_mask"].unsqueeze(-1)
|
| 143 |
+
pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
|
| 144 |
+
return F.normalize(pooled, dim=1)
|
| 145 |
+
|
| 146 |
+
def _prototype_matrix(self, bucket: str, texts: Sequence[str]) -> torch.Tensor:
|
| 147 |
+
if bucket not in self._prototype_cache:
|
| 148 |
+
self._prototype_cache[bucket] = self._embed_texts(texts)
|
| 149 |
+
return self._prototype_cache[bucket]
|
| 150 |
+
|
| 151 |
+
@staticmethod
|
| 152 |
+
def _unit_similarity(candidate: torch.Tensor, matrix: torch.Tensor) -> float:
|
| 153 |
+
similarity = torch.matmul(candidate, matrix.T).max().item()
|
| 154 |
+
return round((similarity + 1.0) / 2.0, 4)
|
| 155 |
+
|
| 156 |
+
@staticmethod
|
| 157 |
+
def _quality_label(score: float) -> str:
|
| 158 |
+
if score >= 0.82:
|
| 159 |
+
return "excellent"
|
| 160 |
+
if score >= 0.66:
|
| 161 |
+
return "good"
|
| 162 |
+
if score >= 0.45:
|
| 163 |
+
return "needs_work"
|
| 164 |
+
return "risky"
|
| 165 |
+
|
| 166 |
+
def predict(
|
| 167 |
+
self,
|
| 168 |
+
code: str,
|
| 169 |
+
context_window: str,
|
| 170 |
+
traceback_text: str,
|
| 171 |
+
static_summary: Dict[str, object],
|
| 172 |
+
) -> Dict[str, object]:
|
| 173 |
+
"""Predict domain probabilities, quality, and issue risks for Python code."""
|
| 174 |
+
|
| 175 |
+
document = (
|
| 176 |
+
f"Code:\n{code.strip()[:4000]}\n\n"
|
| 177 |
+
f"Context:\n{context_window.strip()[:1000]}\n\n"
|
| 178 |
+
f"Traceback:\n{traceback_text.strip()[:1000]}\n\n"
|
| 179 |
+
f"Static hints:\n{static_summary}\n"
|
| 180 |
+
)
|
| 181 |
+
candidate = self._embed_texts([document])
|
| 182 |
+
|
| 183 |
+
domain_scores: Dict[str, float] = {}
|
| 184 |
+
for domain, texts in DOMAIN_PROTOTYPES.items():
|
| 185 |
+
domain_scores[domain] = self._unit_similarity(candidate, self._prototype_matrix(f"domain:{domain}", texts))
|
| 186 |
+
|
| 187 |
+
high_matrix = self._prototype_matrix("quality:high", QUALITY_ANCHORS["high"])
|
| 188 |
+
low_matrix = self._prototype_matrix("quality:low", QUALITY_ANCHORS["low"])
|
| 189 |
+
high_similarity = torch.matmul(candidate, high_matrix.T).max().item()
|
| 190 |
+
low_similarity = torch.matmul(candidate, low_matrix.T).max().item()
|
| 191 |
+
ml_quality_score = round(float(torch.sigmoid(torch.tensor((high_similarity - low_similarity) * 4.0)).item()), 4)
|
| 192 |
+
|
| 193 |
+
high_maintainability = torch.matmul(
|
| 194 |
+
candidate,
|
| 195 |
+
self._prototype_matrix("maintainability:high", MAINTAINABILITY_ANCHORS["high"]).T,
|
| 196 |
+
).max().item()
|
| 197 |
+
low_maintainability = torch.matmul(
|
| 198 |
+
candidate,
|
| 199 |
+
self._prototype_matrix("maintainability:low", MAINTAINABILITY_ANCHORS["low"]).T,
|
| 200 |
+
).max().item()
|
| 201 |
+
maintainability_score = round(
|
| 202 |
+
float(torch.sigmoid(torch.tensor((high_maintainability - low_maintainability) * 4.0)).item()),
|
| 203 |
+
4,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
issue_logits = []
|
| 207 |
+
issue_labels = list(ISSUE_ANCHORS.keys())
|
| 208 |
+
for label in issue_labels:
|
| 209 |
+
similarity = torch.matmul(candidate, self._prototype_matrix(f"issue:{label}", ISSUE_ANCHORS[label]).T).max().item()
|
| 210 |
+
issue_logits.append(similarity)
|
| 211 |
+
probabilities = torch.softmax(torch.tensor(issue_logits) * 3.0, dim=0)
|
| 212 |
+
issue_probabilities = {
|
| 213 |
+
label: round(float(probabilities[index].item()), 4)
|
| 214 |
+
for index, label in enumerate(issue_labels)
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
return {
|
| 218 |
+
"domain_scores": domain_scores,
|
| 219 |
+
"ml_quality_score": ml_quality_score,
|
| 220 |
+
"quality_score": ml_quality_score,
|
| 221 |
+
"quality_label": self._quality_label(ml_quality_score),
|
| 222 |
+
"maintainability_score": maintainability_score,
|
| 223 |
+
"issue_probabilities": issue_probabilities,
|
| 224 |
+
"backend_name": self.backend_name,
|
| 225 |
+
"model_id": self.model_id,
|
| 226 |
+
"notes": list(self.notes),
|
| 227 |
+
}
|
openenv_python_code_review_env.egg-info/PKG-INFO
CHANGED
|
@@ -6,7 +6,6 @@ Requires-Python: >=3.10
|
|
| 6 |
Description-Content-Type: text/markdown
|
| 7 |
Requires-Dist: fastapi>=0.111.0
|
| 8 |
Requires-Dist: gradio>=5.26.0
|
| 9 |
-
Requires-Dist: hf-xet>=1.4.3
|
| 10 |
Requires-Dist: openai>=1.76.0
|
| 11 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 12 |
Requires-Dist: streamlit>=1.44.0
|
|
@@ -35,25 +34,26 @@ Production-ready hackathon submission for OpenEnv evaluation, deterministic vali
|
|
| 35 |
|
| 36 |
```text
|
| 37 |
root
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
| 57 |
```
|
| 58 |
|
| 59 |
Runtime flow:
|
|
@@ -71,8 +71,8 @@ inference.py
|
|
| 71 |
|
| 72 |
- `inference.py` now lives at the repo root and delegates to a strict runner under `app/env`.
|
| 73 |
- OpenAI usage is limited to the official Python client:
|
| 74 |
-
`client = OpenAI(base_url=API_BASE_URL, api_key=
|
| 75 |
-
- Defaulted env vars are enforced for `API_BASE_URL` and `MODEL_NAME`; `HF_TOKEN`
|
| 76 |
- Output now matches the required single-line contract exactly and always emits `[END]`, including failure paths.
|
| 77 |
- The RL loop now uses `reset()` plus `step_result()` in a proper `while not done` loop.
|
| 78 |
- Step errors now surface through `last_action_error` and are printed in `[STEP]`.
|
|
@@ -120,7 +120,9 @@ Required environment variables:
|
|
| 120 |
- `MODEL_NAME`
|
| 121 |
Default: `Qwen/Qwen2.5-3B-Instruct`
|
| 122 |
- `HF_TOKEN`
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
|
| 125 |
Example:
|
| 126 |
|
|
@@ -131,6 +133,13 @@ set HF_TOKEN=hf_xxx
|
|
| 131 |
python inference.py
|
| 132 |
```
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
Expected stdout shape:
|
| 135 |
|
| 136 |
```text
|
|
@@ -147,7 +156,7 @@ Expected stdout shape:
|
|
| 147 |
Build from the project root:
|
| 148 |
|
| 149 |
```bash
|
| 150 |
-
docker build -
|
| 151 |
```
|
| 152 |
|
| 153 |
Run locally:
|
|
@@ -173,7 +182,7 @@ Recommended deployment steps:
|
|
| 173 |
|
| 174 |
1. Create a Docker Space.
|
| 175 |
2. Push this repository as-is.
|
| 176 |
-
3. Let Spaces build
|
| 177 |
4. Set Space secrets:
|
| 178 |
`HF_TOKEN`
|
| 179 |
5. Set Space variables as needed:
|
|
|
|
| 6 |
Description-Content-Type: text/markdown
|
| 7 |
Requires-Dist: fastapi>=0.111.0
|
| 8 |
Requires-Dist: gradio>=5.26.0
|
|
|
|
| 9 |
Requires-Dist: openai>=1.76.0
|
| 10 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 11 |
Requires-Dist: streamlit>=1.44.0
|
|
|
|
| 34 |
|
| 35 |
```text
|
| 36 |
root
|
| 37 |
+
|- inference.py # Root validator entrypoint
|
| 38 |
+
|- openenv.yaml # OpenEnv manifest
|
| 39 |
+
|- app/
|
| 40 |
+
| |- agents/ # Action policy and fallback strategy
|
| 41 |
+
| |- env/ # RL loop runner and stdout contract
|
| 42 |
+
| |- models/ # Inference dataclasses/config
|
| 43 |
+
| |- services/ # OpenAI client wrapper with retries
|
| 44 |
+
| `- utils/ # Formatting, task loading, log suppression
|
| 45 |
+
|- server/
|
| 46 |
+
| |- env.py # OpenEnv environment and reward shaping
|
| 47 |
+
| |- app.py # FastAPI/OpenEnv app, optional Gradio mount
|
| 48 |
+
| `- Dockerfile # Alternate Docker build path
|
| 49 |
+
|- Dockerfile # Root deployment Docker image
|
| 50 |
+
|- graders/ # Syntax, bug-fix, optimization graders
|
| 51 |
+
|- tasks/ # Deterministic benchmark tasks and references
|
| 52 |
+
|- services/ # Multi-domain analysis services
|
| 53 |
+
|- analyzers/ # Domain-specific analyzers
|
| 54 |
+
|- models/ # Lazy-loaded PyTorch scoring model
|
| 55 |
+
|- schemas/ # API request/response contracts
|
| 56 |
+
`- tests/ # Local validation coverage
|
| 57 |
```
|
| 58 |
|
| 59 |
Runtime flow:
|
|
|
|
| 71 |
|
| 72 |
- `inference.py` now lives at the repo root and delegates to a strict runner under `app/env`.
|
| 73 |
- OpenAI usage is limited to the official Python client:
|
| 74 |
+
`client = OpenAI(base_url=API_BASE_URL, api_key=provider_token)`.
|
| 75 |
+
- Defaulted env vars are enforced for `API_BASE_URL` and `MODEL_NAME`; the runtime now selects `HF_TOKEN` for the Hugging Face router and `OPENAI_API_KEY` for direct OpenAI usage.
|
| 76 |
- Output now matches the required single-line contract exactly and always emits `[END]`, including failure paths.
|
| 77 |
- The RL loop now uses `reset()` plus `step_result()` in a proper `while not done` loop.
|
| 78 |
- Step errors now surface through `last_action_error` and are printed in `[STEP]`.
|
|
|
|
| 120 |
- `MODEL_NAME`
|
| 121 |
Default: `Qwen/Qwen2.5-3B-Instruct`
|
| 122 |
- `HF_TOKEN`
|
| 123 |
+
Required for `https://router.huggingface.co/v1`
|
| 124 |
+
- `OPENAI_API_KEY`
|
| 125 |
+
Required for `https://api.openai.com/v1`
|
| 126 |
|
| 127 |
Example:
|
| 128 |
|
|
|
|
| 133 |
python inference.py
|
| 134 |
```
|
| 135 |
|
| 136 |
+
```bash
|
| 137 |
+
set API_BASE_URL=https://api.openai.com/v1
|
| 138 |
+
set MODEL_NAME=gpt-4.1-mini
|
| 139 |
+
set OPENAI_API_KEY=sk-xxx
|
| 140 |
+
python inference.py
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
Expected stdout shape:
|
| 144 |
|
| 145 |
```text
|
|
|
|
| 156 |
Build from the project root:
|
| 157 |
|
| 158 |
```bash
|
| 159 |
+
docker build -t openenv-python-code-review-env .
|
| 160 |
```
|
| 161 |
|
| 162 |
Run locally:
|
|
|
|
| 182 |
|
| 183 |
1. Create a Docker Space.
|
| 184 |
2. Push this repository as-is.
|
| 185 |
+
3. Let Spaces build from the root `Dockerfile`.
|
| 186 |
4. Set Space secrets:
|
| 187 |
`HF_TOKEN`
|
| 188 |
5. Set Space variables as needed:
|
openenv_python_code_review_env.egg-info/SOURCES.txt
CHANGED
|
@@ -1,5 +1,15 @@
|
|
| 1 |
README.md
|
| 2 |
pyproject.toml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
analyzers/__init__.py
|
| 4 |
analyzers/ds_analyzer.py
|
| 5 |
analyzers/dsa_analyzer.py
|
|
@@ -12,6 +22,8 @@ app/examples.py
|
|
| 12 |
app/streamlit_app.py
|
| 13 |
app/agents/__init__.py
|
| 14 |
app/agents/review_agent.py
|
|
|
|
|
|
|
| 15 |
app/models/__init__.py
|
| 16 |
app/models/inference.py
|
| 17 |
app/services/__init__.py
|
|
|
|
| 1 |
README.md
|
| 2 |
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./compat.py
|
| 6 |
+
./inference.py
|
| 7 |
+
./launch.py
|
| 8 |
+
./models.py
|
| 9 |
+
./sitecustomize.py
|
| 10 |
+
./triage.py
|
| 11 |
+
./triage_catalog.py
|
| 12 |
+
./triage_models.py
|
| 13 |
analyzers/__init__.py
|
| 14 |
analyzers/ds_analyzer.py
|
| 15 |
analyzers/dsa_analyzer.py
|
|
|
|
| 22 |
app/streamlit_app.py
|
| 23 |
app/agents/__init__.py
|
| 24 |
app/agents/review_agent.py
|
| 25 |
+
app/env/__init__.py
|
| 26 |
+
app/env/runner.py
|
| 27 |
app/models/__init__.py
|
| 28 |
app/models/inference.py
|
| 29 |
app/services/__init__.py
|
openenv_python_code_review_env.egg-info/requires.txt
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
fastapi>=0.111.0
|
| 2 |
gradio>=5.26.0
|
| 3 |
-
hf-xet>=1.4.3
|
| 4 |
openai>=1.76.0
|
| 5 |
openenv-core[core]>=0.2.2
|
| 6 |
streamlit>=1.44.0
|
|
|
|
| 1 |
fastapi>=0.111.0
|
| 2 |
gradio>=5.26.0
|
|
|
|
| 3 |
openai>=1.76.0
|
| 4 |
openenv-core[core]>=0.2.2
|
| 5 |
streamlit>=1.44.0
|
openenv_python_code_review_env.egg-info/top_level.txt
CHANGED
|
@@ -1,14 +1 @@
|
|
| 1 |
-
|
| 2 |
-
api
|
| 3 |
-
app
|
| 4 |
-
build
|
| 5 |
-
graders
|
| 6 |
-
models
|
| 7 |
-
outputs
|
| 8 |
-
schemas
|
| 9 |
-
server
|
| 10 |
-
services
|
| 11 |
-
tasks
|
| 12 |
-
tests
|
| 13 |
-
utils
|
| 14 |
-
venv
|
|
|
|
| 1 |
+
python_env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
|
@@ -8,11 +8,9 @@ version = "1.0.0"
|
|
| 8 |
description = "TorchReview Copilot: AI-powered Python code triage with PyTorch and OpenEnv validation."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
| 11 |
-
|
| 12 |
dependencies = [
|
| 13 |
"fastapi>=0.111.0",
|
| 14 |
"gradio>=5.26.0",
|
| 15 |
-
"hf-xet>=1.4.3",
|
| 16 |
"openai>=1.76.0",
|
| 17 |
"openenv-core[core]>=0.2.2",
|
| 18 |
"streamlit>=1.44.0",
|
|
@@ -30,9 +28,27 @@ dev = [
|
|
| 30 |
[project.scripts]
|
| 31 |
server = "python_env.server.app:main"
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
[tool.setuptools]
|
| 34 |
include-package-data = true
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
description = "TorchReview Copilot: AI-powered Python code triage with PyTorch and OpenEnv validation."
|
| 9 |
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
|
|
|
| 11 |
dependencies = [
|
| 12 |
"fastapi>=0.111.0",
|
| 13 |
"gradio>=5.26.0",
|
|
|
|
| 14 |
"openai>=1.76.0",
|
| 15 |
"openenv-core[core]>=0.2.2",
|
| 16 |
"streamlit>=1.44.0",
|
|
|
|
| 28 |
[project.scripts]
|
| 29 |
server = "python_env.server.app:main"
|
| 30 |
|
| 31 |
+
[tool.pytest.ini_options]
|
| 32 |
+
pythonpath = ["."]
|
| 33 |
+
|
| 34 |
[tool.setuptools]
|
| 35 |
include-package-data = true
|
| 36 |
+
packages = [
|
| 37 |
+
"python_env",
|
| 38 |
+
"python_env.server",
|
| 39 |
+
"python_env.tasks",
|
| 40 |
+
"python_env.graders",
|
| 41 |
+
"python_env.api",
|
| 42 |
+
"python_env.app",
|
| 43 |
+
"python_env.app.agents",
|
| 44 |
+
"python_env.app.env",
|
| 45 |
+
"python_env.app.models",
|
| 46 |
+
"python_env.app.services",
|
| 47 |
+
"python_env.app.utils",
|
| 48 |
+
"python_env.analyzers",
|
| 49 |
+
"python_env.models",
|
| 50 |
+
"python_env.schemas",
|
| 51 |
+
"python_env.services",
|
| 52 |
+
"python_env.utils",
|
| 53 |
+
]
|
| 54 |
+
package-dir = { "python_env" = ".", "python_env.server" = "server", "python_env.tasks" = "tasks", "python_env.graders" = "graders", "python_env.api" = "api", "python_env.app" = "app", "python_env.app.agents" = "app/agents", "python_env.app.env" = "app/env", "python_env.app.models" = "app/models", "python_env.app.services" = "app/services", "python_env.app.utils" = "app/utils", "python_env.analyzers" = "analyzers", "python_env.models" = "models", "python_env.schemas" = "schemas", "python_env.services" = "services", "python_env.utils" = "utils" }
|
schemas/request.py
CHANGED
|
@@ -1,19 +1,51 @@
|
|
| 1 |
-
"""Request schemas for
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from typing import Literal
|
| 6 |
-
|
| 7 |
-
from pydantic import BaseModel, Field
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
DomainHint = Literal["auto", "dsa", "data_science", "ml_dl", "web"]
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class AnalyzeCodeRequest(BaseModel):
|
| 14 |
-
"""Validated input payload for
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Request schemas for the AI-powered code review workflow."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Literal
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
DomainHint = Literal["auto", "general", "dsa", "data_science", "ml_dl", "web"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class AnalyzeCodeRequest(BaseModel):
|
| 14 |
+
"""Validated input payload for Python code review requests."""
|
| 15 |
+
|
| 16 |
+
model_config = ConfigDict(str_strip_whitespace=True)
|
| 17 |
+
|
| 18 |
+
code: str = Field(..., min_length=1, description="Python source code to analyze.")
|
| 19 |
+
context_window: str = Field(
|
| 20 |
+
default="",
|
| 21 |
+
max_length=4000,
|
| 22 |
+
description="Optional repository, pull request, or runtime context.",
|
| 23 |
+
)
|
| 24 |
+
traceback_text: str = Field(
|
| 25 |
+
default="",
|
| 26 |
+
max_length=4000,
|
| 27 |
+
description="Optional traceback or failing test output.",
|
| 28 |
+
)
|
| 29 |
+
domain_hint: DomainHint = Field(
|
| 30 |
+
default="auto",
|
| 31 |
+
description="Optional analysis lens for domain-aware suggestions.",
|
| 32 |
+
)
|
| 33 |
+
filename: str = Field(default="snippet.py", max_length=255, description="Virtual filename for display.")
|
| 34 |
+
enable_suggestions: bool = Field(
|
| 35 |
+
default=True,
|
| 36 |
+
description="Whether the service should return a prioritized improvement plan.",
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
@field_validator("code")
|
| 40 |
+
@classmethod
|
| 41 |
+
def _reject_empty_code(cls, value: str) -> str:
|
| 42 |
+
stripped = value.strip()
|
| 43 |
+
if not stripped:
|
| 44 |
+
raise ValueError("code must not be empty")
|
| 45 |
+
return stripped
|
| 46 |
+
|
| 47 |
+
@field_validator("filename")
|
| 48 |
+
@classmethod
|
| 49 |
+
def _normalize_filename(cls, value: str) -> str:
|
| 50 |
+
candidate = value.strip() or "snippet.py"
|
| 51 |
+
return candidate[:255]
|
schemas/response.py
CHANGED
|
@@ -1,73 +1,109 @@
|
|
| 1 |
-
"""Response schemas for the
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from typing import Dict, List, Literal
|
| 6 |
-
|
| 7 |
-
from pydantic import BaseModel, Field
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Response schemas for the AI-powered code review platform."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Literal
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
Severity = Literal["low", "medium", "high"]
|
| 11 |
+
IssueCategory = Literal["correctness", "maintainability", "performance", "security", "style"]
|
| 12 |
+
QualityLabel = Literal["excellent", "good", "needs_work", "risky"]
|
| 13 |
+
DetectedDomain = Literal["general", "dsa", "data_science", "ml_dl", "web"]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AnalysisIssue(BaseModel):
|
| 17 |
+
"""One detected issue or risk in the code snippet."""
|
| 18 |
+
|
| 19 |
+
title: str
|
| 20 |
+
category: IssueCategory = "maintainability"
|
| 21 |
+
severity: Severity
|
| 22 |
+
description: str
|
| 23 |
+
line_hint: int | None = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class StaticAnalysisSummary(BaseModel):
|
| 27 |
+
"""Python-specific static-analysis signals."""
|
| 28 |
+
|
| 29 |
+
syntax_valid: bool
|
| 30 |
+
syntax_error: str = ""
|
| 31 |
+
cyclomatic_complexity: int = Field(..., ge=1)
|
| 32 |
+
line_count: int = Field(..., ge=0)
|
| 33 |
+
max_nesting_depth: int = Field(..., ge=0)
|
| 34 |
+
max_loop_depth: int = Field(..., ge=0)
|
| 35 |
+
time_complexity: str = "Unknown"
|
| 36 |
+
space_complexity: str = "Unknown"
|
| 37 |
+
lint_score: float = Field(..., ge=0.0, le=1.0)
|
| 38 |
+
docstring_coverage: float = Field(..., ge=0.0, le=1.0)
|
| 39 |
+
detected_imports: List[str] = Field(default_factory=list)
|
| 40 |
+
code_smells: List[str] = Field(default_factory=list)
|
| 41 |
+
issues: List[AnalysisIssue] = Field(default_factory=list)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class DomainAnalysis(BaseModel):
|
| 45 |
+
"""Domain-aware review signals used for context-specific suggestions."""
|
| 46 |
+
|
| 47 |
+
domain: DetectedDomain
|
| 48 |
+
domain_score: float = Field(..., ge=0.0, le=1.0)
|
| 49 |
+
issues: List[AnalysisIssue] = Field(default_factory=list)
|
| 50 |
+
suggestions: List[str] = Field(default_factory=list)
|
| 51 |
+
highlights: Dict[str, float | str] = Field(default_factory=dict)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ModelPrediction(BaseModel):
|
| 55 |
+
"""PyTorch model output derived from pretrained code embeddings."""
|
| 56 |
+
|
| 57 |
+
quality_label: QualityLabel
|
| 58 |
+
quality_score: float = Field(..., ge=0.0, le=1.0)
|
| 59 |
+
maintainability_score: float = Field(..., ge=0.0, le=1.0)
|
| 60 |
+
issue_probabilities: Dict[str, float] = Field(default_factory=dict)
|
| 61 |
+
notes: List[str] = Field(default_factory=list)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class ScoreBreakdown(BaseModel):
|
| 65 |
+
"""Reward inputs and the final RL-ready scalar reward."""
|
| 66 |
+
|
| 67 |
+
ml_score: float = Field(..., ge=0.0, le=1.0)
|
| 68 |
+
domain_score: float = Field(..., ge=0.0, le=1.0)
|
| 69 |
+
lint_score: float = Field(..., ge=0.0, le=1.0)
|
| 70 |
+
complexity_penalty: float = Field(..., ge=0.0, le=1.0)
|
| 71 |
+
maintainability_score: float = Field(..., ge=0.0, le=1.0)
|
| 72 |
+
security_score: float = Field(..., ge=0.0, le=1.0)
|
| 73 |
+
readability_score: float = Field(..., ge=0.0, le=1.0)
|
| 74 |
+
quality_signal: float = Field(..., ge=0.0, le=1.0)
|
| 75 |
+
error_reduction_signal: float = Field(..., ge=0.0, le=1.0)
|
| 76 |
+
completion_signal: float = Field(..., ge=0.0, le=1.0)
|
| 77 |
+
reward: float = Field(..., ge=0.0, le=1.0)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class SuggestionItem(BaseModel):
|
| 81 |
+
"""One prioritized improvement suggestion."""
|
| 82 |
+
|
| 83 |
+
priority: Literal["P0", "P1", "P2"]
|
| 84 |
+
title: str
|
| 85 |
+
rationale: str
|
| 86 |
+
action: str
|
| 87 |
+
category: IssueCategory
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class AnalyzeCodeResponse(BaseModel):
|
| 91 |
+
"""Top-level structured output for API and UI consumers."""
|
| 92 |
+
|
| 93 |
+
language: Literal["python"] = "python"
|
| 94 |
+
detected_domain: DetectedDomain
|
| 95 |
+
domain_confidences: Dict[str, float] = Field(default_factory=dict)
|
| 96 |
+
score_breakdown: ScoreBreakdown
|
| 97 |
+
static_analysis: StaticAnalysisSummary
|
| 98 |
+
model_prediction: ModelPrediction
|
| 99 |
+
domain_analysis: DomainAnalysis
|
| 100 |
+
suggestions: List[SuggestionItem] = Field(default_factory=list)
|
| 101 |
+
improvement_plan: List[str] = Field(default_factory=list)
|
| 102 |
+
auto_fix_preview: List[str] = Field(default_factory=list)
|
| 103 |
+
score_visualization: Dict[str, float] = Field(default_factory=dict)
|
| 104 |
+
model_backend: str
|
| 105 |
+
model_id: str
|
| 106 |
+
summary: str
|
| 107 |
+
context_window: str = ""
|
| 108 |
+
filename: str = "snippet.py"
|
| 109 |
+
analysis_time_ms: float = Field(..., ge=0.0)
|
server/Dockerfile
CHANGED
|
@@ -6,7 +6,8 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 6 |
PYTHONIOENCODING=utf-8 \
|
| 7 |
PIP_NO_CACHE_DIR=1 \
|
| 8 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 9 |
-
ENABLE_GRADIO_DEMO=false
|
|
|
|
| 10 |
|
| 11 |
WORKDIR /app
|
| 12 |
|
|
@@ -24,4 +25,4 @@ EXPOSE 8000
|
|
| 24 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 25 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
|
| 26 |
|
| 27 |
-
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"
|
|
|
|
| 6 |
PYTHONIOENCODING=utf-8 \
|
| 7 |
PIP_NO_CACHE_DIR=1 \
|
| 8 |
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 9 |
+
ENABLE_GRADIO_DEMO=false \
|
| 10 |
+
ENABLE_WEB_INTERFACE=false
|
| 11 |
|
| 12 |
WORKDIR /app
|
| 13 |
|
|
|
|
| 25 |
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 26 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
|
| 27 |
|
| 28 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
server/app.py
CHANGED
|
@@ -18,12 +18,12 @@ try:
|
|
| 18 |
except Exception:
|
| 19 |
gr = None # type: ignore[assignment]
|
| 20 |
|
| 21 |
-
try:
|
| 22 |
-
from ..models import PythonCodeReviewAction, PythonCodeReviewObservation
|
| 23 |
-
from .env import PythonCodeReviewEnvironment
|
| 24 |
-
except ImportError:
|
| 25 |
-
from models import PythonCodeReviewAction, PythonCodeReviewObservation
|
| 26 |
-
from server.env import PythonCodeReviewEnvironment
|
| 27 |
|
| 28 |
|
| 29 |
def _gradio_enabled() -> bool:
|
|
@@ -40,7 +40,7 @@ def _max_concurrent_envs() -> int:
|
|
| 40 |
return 2
|
| 41 |
|
| 42 |
|
| 43 |
-
def build_application():
|
| 44 |
"""Compose the OpenEnv API with the Gradio demo frontend."""
|
| 45 |
|
| 46 |
api_app = create_app(
|
|
@@ -50,19 +50,13 @@ def build_application():
|
|
| 50 |
env_name="python_code_review_env",
|
| 51 |
max_concurrent_envs=_max_concurrent_envs(),
|
| 52 |
)
|
| 53 |
-
served_app = api_app
|
| 54 |
-
if gr is not None and _gradio_enabled():
|
| 55 |
-
try:
|
| 56 |
-
from .demo import
|
| 57 |
-
except ImportError:
|
| 58 |
-
from server.demo import
|
| 59 |
-
served_app = gr.mount_gradio_app(
|
| 60 |
-
api_app,
|
| 61 |
-
build_demo(),
|
| 62 |
-
path="/",
|
| 63 |
-
theme=gr.themes.Soft(primary_hue="orange", secondary_hue="amber"),
|
| 64 |
-
css=CSS,
|
| 65 |
-
)
|
| 66 |
|
| 67 |
wrapper_app = FastAPI(title="python_code_review_env", version="1.0.0")
|
| 68 |
|
|
@@ -77,10 +71,10 @@ def build_application():
|
|
| 77 |
app = build_application()
|
| 78 |
|
| 79 |
|
| 80 |
-
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 81 |
-
import uvicorn
|
| 82 |
-
|
| 83 |
-
uvicorn.run(app, host=host, port=port
|
| 84 |
|
| 85 |
|
| 86 |
if __name__ == "__main__":
|
|
|
|
| 18 |
except Exception:
|
| 19 |
gr = None # type: ignore[assignment]
|
| 20 |
|
| 21 |
+
try:
|
| 22 |
+
from ..models import PythonCodeReviewAction, PythonCodeReviewObservation
|
| 23 |
+
from .env import PythonCodeReviewEnvironment
|
| 24 |
+
except ImportError:
|
| 25 |
+
from models import PythonCodeReviewAction, PythonCodeReviewObservation
|
| 26 |
+
from server.env import PythonCodeReviewEnvironment
|
| 27 |
|
| 28 |
|
| 29 |
def _gradio_enabled() -> bool:
|
|
|
|
| 40 |
return 2
|
| 41 |
|
| 42 |
|
| 43 |
+
def build_application():
|
| 44 |
"""Compose the OpenEnv API with the Gradio demo frontend."""
|
| 45 |
|
| 46 |
api_app = create_app(
|
|
|
|
| 50 |
env_name="python_code_review_env",
|
| 51 |
max_concurrent_envs=_max_concurrent_envs(),
|
| 52 |
)
|
| 53 |
+
served_app = api_app
|
| 54 |
+
if gr is not None and _gradio_enabled():
|
| 55 |
+
try:
|
| 56 |
+
from .demo import build_demo
|
| 57 |
+
except ImportError:
|
| 58 |
+
from server.demo import build_demo
|
| 59 |
+
served_app = gr.mount_gradio_app(api_app, build_demo(), path="/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
wrapper_app = FastAPI(title="python_code_review_env", version="1.0.0")
|
| 62 |
|
|
|
|
| 71 |
app = build_application()
|
| 72 |
|
| 73 |
|
| 74 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 75 |
+
import uvicorn
|
| 76 |
+
|
| 77 |
+
uvicorn.run(app, host=host, port=port)
|
| 78 |
|
| 79 |
|
| 80 |
if __name__ == "__main__":
|
server/demo.py
CHANGED
|
@@ -347,7 +347,7 @@ def build_demo() -> gr.Blocks:
|
|
| 347 |
examples = get_default_engine().example_map()
|
| 348 |
first_example = next(iter(examples.values()))
|
| 349 |
|
| 350 |
-
with gr.Blocks(title="TorchReview Copilot") as demo:
|
| 351 |
gr.HTML(
|
| 352 |
"""
|
| 353 |
<div class="hero-card">
|
|
|
|
| 347 |
examples = get_default_engine().example_map()
|
| 348 |
first_example = next(iter(examples.values()))
|
| 349 |
|
| 350 |
+
with gr.Blocks(theme=gr.themes.Soft(primary_hue="orange", secondary_hue="amber"), css=CSS, title="TorchReview Copilot") as demo:
|
| 351 |
gr.HTML(
|
| 352 |
"""
|
| 353 |
<div class="hero-card">
|
server/env.py
CHANGED
|
@@ -10,7 +10,7 @@ from openenv.core.env_server.types import EnvironmentMetadata
|
|
| 10 |
|
| 11 |
try:
|
| 12 |
from ..graders import grade_task
|
| 13 |
-
from ..graders.shared import component_score, safe_ratio,
|
| 14 |
from ..models import (
|
| 15 |
HistoryEntry,
|
| 16 |
PythonCodeReviewAction,
|
|
@@ -22,7 +22,7 @@ try:
|
|
| 22 |
from ..tasks import ReviewTask, list_tasks, select_task
|
| 23 |
except ImportError:
|
| 24 |
from graders import grade_task
|
| 25 |
-
from graders.shared import component_score, safe_ratio,
|
| 26 |
from models import (
|
| 27 |
HistoryEntry,
|
| 28 |
PythonCodeReviewAction,
|
|
@@ -46,7 +46,7 @@ def _empty_grade() -> TaskGrade:
|
|
| 46 |
|
| 47 |
|
| 48 |
def _reward_value(value: float) -> float:
|
| 49 |
-
return
|
| 50 |
|
| 51 |
|
| 52 |
class PythonCodeReviewEnvironment(
|
|
@@ -300,36 +300,45 @@ class PythonCodeReviewEnvironment(
|
|
| 300 |
) -> RewardDetails:
|
| 301 |
prev_score = previous_grade.score
|
| 302 |
curr_score = current_grade.score
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
prev_rate = safe_ratio(previous_grade.tests_passed, previous_grade.tests_total)
|
| 304 |
curr_rate = safe_ratio(current_grade.tests_passed, current_grade.tests_total)
|
| 305 |
prev_runtime = previous_grade.runtime_score
|
| 306 |
curr_runtime = current_grade.runtime_score
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
syntax_reward =
|
| 311 |
-
test_reward =
|
| 312 |
-
progress_delta =
|
| 313 |
-
quality_bonus =
|
| 314 |
-
runtime_bonus =
|
| 315 |
-
error_reduction_bonus =
|
| 316 |
-
completion_bonus = 0.
|
| 317 |
-
correctness_bonus =
|
| 318 |
-
|
| 319 |
-
invalid_action_penalty =
|
| 320 |
-
timeout_penalty =
|
| 321 |
-
regression_penalty =
|
| 322 |
-
stagnation_penalty =
|
| 323 |
|
| 324 |
raw_value = (
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
+ syntax_reward
|
| 327 |
+ test_reward
|
| 328 |
+ progress_delta
|
| 329 |
+ quality_bonus
|
|
|
|
| 330 |
+ error_reduction_bonus
|
| 331 |
+ completion_bonus
|
| 332 |
-
+ runtime_bonus
|
| 333 |
+ correctness_bonus
|
| 334 |
- invalid_action_penalty
|
| 335 |
- timeout_penalty
|
|
@@ -367,22 +376,22 @@ class PythonCodeReviewEnvironment(
|
|
| 367 |
reason_parts.append("no meaningful state change")
|
| 368 |
|
| 369 |
return RewardDetails(
|
| 370 |
-
value=value,
|
| 371 |
-
syntax_reward=syntax_reward,
|
| 372 |
-
test_reward=test_reward,
|
| 373 |
-
correctness_bonus=correctness_bonus,
|
| 374 |
-
quality_bonus=quality_bonus,
|
| 375 |
-
error_reduction_bonus=error_reduction_bonus,
|
| 376 |
-
completion_bonus=completion_bonus,
|
| 377 |
-
runtime_bonus=runtime_bonus,
|
| 378 |
-
progress_delta=progress_delta,
|
| 379 |
-
invalid_action_penalty=invalid_action_penalty,
|
| 380 |
-
timeout_penalty=timeout_penalty,
|
| 381 |
-
regression_penalty=regression_penalty,
|
| 382 |
-
stagnation_penalty=stagnation_penalty,
|
| 383 |
reason=", ".join(reason_parts),
|
| 384 |
-
prev_score=prev_score,
|
| 385 |
-
curr_score=curr_score,
|
| 386 |
code_changed=code_changed,
|
| 387 |
)
|
| 388 |
|
|
|
|
| 10 |
|
| 11 |
try:
|
| 12 |
from ..graders import grade_task
|
| 13 |
+
from ..graders.shared import component_score, final_score_pipeline, safe_ratio, safe_score
|
| 14 |
from ..models import (
|
| 15 |
HistoryEntry,
|
| 16 |
PythonCodeReviewAction,
|
|
|
|
| 22 |
from ..tasks import ReviewTask, list_tasks, select_task
|
| 23 |
except ImportError:
|
| 24 |
from graders import grade_task
|
| 25 |
+
from graders.shared import component_score, final_score_pipeline, safe_ratio, safe_score
|
| 26 |
from models import (
|
| 27 |
HistoryEntry,
|
| 28 |
PythonCodeReviewAction,
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def _reward_value(value: float) -> float:
|
| 49 |
+
return final_score_pipeline(value)
|
| 50 |
|
| 51 |
|
| 52 |
class PythonCodeReviewEnvironment(
|
|
|
|
| 300 |
) -> RewardDetails:
|
| 301 |
prev_score = previous_grade.score
|
| 302 |
curr_score = current_grade.score
|
| 303 |
+
prev_syntax = previous_grade.syntax_score
|
| 304 |
+
curr_syntax = current_grade.syntax_score
|
| 305 |
+
prev_quality = previous_grade.quality_score
|
| 306 |
+
curr_quality = current_grade.quality_score
|
| 307 |
prev_rate = safe_ratio(previous_grade.tests_passed, previous_grade.tests_total)
|
| 308 |
curr_rate = safe_ratio(current_grade.tests_passed, current_grade.tests_total)
|
| 309 |
prev_runtime = previous_grade.runtime_score
|
| 310 |
curr_runtime = current_grade.runtime_score
|
| 311 |
+
prev_compile_health = 0.1 if str(previous_grade.details.get("compile_error", "")).strip() else 0.95
|
| 312 |
+
curr_compile_health = 0.1 if str(current_grade.details.get("compile_error", "")).strip() else 0.95
|
| 313 |
+
|
| 314 |
+
syntax_reward = max(curr_syntax - prev_syntax, 0.0) * 0.18
|
| 315 |
+
test_reward = max(curr_rate - prev_rate, 0.0) * 0.22
|
| 316 |
+
progress_delta = max(curr_score - prev_score, 0.0) * 0.24
|
| 317 |
+
quality_bonus = max(curr_quality - prev_quality, 0.0) * 0.12
|
| 318 |
+
runtime_bonus = max(curr_runtime - prev_runtime, 0.0) * 0.10
|
| 319 |
+
error_reduction_bonus = max(curr_compile_health - prev_compile_health, 0.0) * 0.14
|
| 320 |
+
completion_bonus = (0.04 + 0.10 * curr_rate) * float(final_submission)
|
| 321 |
+
correctness_bonus = max(curr_score - 0.5, 0.0) * 0.12 * float(final_submission)
|
| 322 |
+
|
| 323 |
+
invalid_action_penalty = (0.04 + (0.08 * (1.0 - prev_score))) if invalid_action else 0.0
|
| 324 |
+
timeout_penalty = (0.05 + (0.06 * max(curr_runtime, prev_runtime))) if timed_out else 0.0
|
| 325 |
+
regression_penalty = max(prev_score - curr_score, 0.0) * 0.24
|
| 326 |
+
stagnation_penalty = (0.02 + (0.04 * prev_score)) if action.action_type == "edit_code" and not code_changed else 0.0
|
| 327 |
|
| 328 |
raw_value = (
|
| 329 |
+
2.0 * (curr_score - 0.5)
|
| 330 |
+
+ 1.2 * (curr_rate - prev_rate)
|
| 331 |
+
+ 0.8 * (curr_quality - prev_quality)
|
| 332 |
+
+ 0.7 * (curr_runtime - prev_runtime)
|
| 333 |
+
+ 0.9 * (curr_syntax - prev_syntax)
|
| 334 |
+
+ 0.6 * (curr_compile_health - prev_compile_health)
|
| 335 |
+ syntax_reward
|
| 336 |
+ test_reward
|
| 337 |
+ progress_delta
|
| 338 |
+ quality_bonus
|
| 339 |
+
+ runtime_bonus
|
| 340 |
+ error_reduction_bonus
|
| 341 |
+ completion_bonus
|
|
|
|
| 342 |
+ correctness_bonus
|
| 343 |
- invalid_action_penalty
|
| 344 |
- timeout_penalty
|
|
|
|
| 376 |
reason_parts.append("no meaningful state change")
|
| 377 |
|
| 378 |
return RewardDetails(
|
| 379 |
+
value=safe_score(value),
|
| 380 |
+
syntax_reward=round(syntax_reward, 6),
|
| 381 |
+
test_reward=round(test_reward, 6),
|
| 382 |
+
correctness_bonus=round(correctness_bonus, 6),
|
| 383 |
+
quality_bonus=round(quality_bonus, 6),
|
| 384 |
+
error_reduction_bonus=round(error_reduction_bonus, 6),
|
| 385 |
+
completion_bonus=round(completion_bonus, 6),
|
| 386 |
+
runtime_bonus=round(runtime_bonus, 6),
|
| 387 |
+
progress_delta=round(progress_delta, 6),
|
| 388 |
+
invalid_action_penalty=round(invalid_action_penalty, 6),
|
| 389 |
+
timeout_penalty=round(timeout_penalty, 6),
|
| 390 |
+
regression_penalty=round(regression_penalty, 6),
|
| 391 |
+
stagnation_penalty=round(stagnation_penalty, 6),
|
| 392 |
reason=", ".join(reason_parts),
|
| 393 |
+
prev_score=safe_score(prev_score),
|
| 394 |
+
curr_score=safe_score(curr_score),
|
| 395 |
code_changed=code_changed,
|
| 396 |
)
|
| 397 |
|
server/requirements.txt
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
-
openenv-core[core]>=0.2.2
|
| 2 |
-
fastapi>=0.111.0
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
torch>=2.2.0
|
| 8 |
-
transformers>=4.45.0
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.111.0
|
| 3 |
+
uvicorn>=0.30.0
|
| 4 |
+
openai>=1.76.0
|
| 5 |
+
torch>=2.2.0
|
| 6 |
+
transformers>=4.45.0
|
|
|
|
|
|
services/analysis_service.py
CHANGED
|
@@ -1,139 +1,258 @@
|
|
| 1 |
-
"""Orchestration layer for
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import time
|
| 6 |
-
from typing import Any, Callable
|
| 7 |
-
|
| 8 |
-
from analyzers import analyze_data_science_code, analyze_dsa_code, analyze_ml_code, analyze_web_code
|
| 9 |
-
from models import PyTorchCodeAnalyzerModel
|
| 10 |
-
from schemas.request import AnalyzeCodeRequest
|
| 11 |
-
from schemas.response import
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Orchestration layer for AI-powered Python code review."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
from typing import Any, Callable
|
| 7 |
+
|
| 8 |
+
from analyzers import analyze_data_science_code, analyze_dsa_code, analyze_ml_code, analyze_web_code
|
| 9 |
+
from models import PyTorchCodeAnalyzerModel
|
| 10 |
+
from schemas.request import AnalyzeCodeRequest
|
| 11 |
+
from schemas.response import (
|
| 12 |
+
AnalysisIssue,
|
| 13 |
+
AnalyzeCodeResponse,
|
| 14 |
+
DomainAnalysis,
|
| 15 |
+
ModelPrediction,
|
| 16 |
+
StaticAnalysisSummary,
|
| 17 |
+
)
|
| 18 |
+
from services.reward_service import RewardService
|
| 19 |
+
from services.suggestion_service import SuggestionService
|
| 20 |
+
from utils import estimate_complexity, parse_code_structure
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _clamp_unit(value: float) -> float:
|
| 24 |
+
return max(0.0, min(1.0, float(value)))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _lint_score(parsed: dict[str, Any]) -> float:
|
| 28 |
+
"""Convert structural smells into a normalized lint-style score."""
|
| 29 |
+
|
| 30 |
+
score = 1.0
|
| 31 |
+
if not parsed.get("syntax_valid", True):
|
| 32 |
+
score -= 0.45
|
| 33 |
+
score -= min(int(parsed.get("long_lines", 0) or 0), 5) * 0.03
|
| 34 |
+
if parsed.get("tabs_used"):
|
| 35 |
+
score -= 0.1
|
| 36 |
+
if parsed.get("trailing_whitespace_lines"):
|
| 37 |
+
score -= 0.05
|
| 38 |
+
if parsed.get("docstring_ratio", 0.0) == 0.0 and parsed.get("function_names"):
|
| 39 |
+
score -= 0.08
|
| 40 |
+
return round(_clamp_unit(score), 4)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _static_issues(parsed: dict[str, Any], complexity: dict[str, Any]) -> list[AnalysisIssue]:
|
| 44 |
+
"""Turn parser and complexity heuristics into review issues."""
|
| 45 |
+
|
| 46 |
+
issues: list[AnalysisIssue] = []
|
| 47 |
+
if not parsed.get("syntax_valid", True):
|
| 48 |
+
issues.append(
|
| 49 |
+
AnalysisIssue(
|
| 50 |
+
title="Syntax error blocks execution",
|
| 51 |
+
category="correctness",
|
| 52 |
+
severity="high",
|
| 53 |
+
description=str(parsed.get("syntax_error", "Python failed to parse the snippet.")),
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
if int(parsed.get("max_loop_depth", 0) or 0) >= 2:
|
| 57 |
+
issues.append(
|
| 58 |
+
AnalysisIssue(
|
| 59 |
+
title="Nested loops increase runtime risk",
|
| 60 |
+
category="performance",
|
| 61 |
+
severity="medium",
|
| 62 |
+
description="The current control flow suggests a brute-force path that may not scale on larger inputs.",
|
| 63 |
+
)
|
| 64 |
+
)
|
| 65 |
+
if int(complexity.get("cyclomatic_complexity", 1) or 1) >= 7:
|
| 66 |
+
issues.append(
|
| 67 |
+
AnalysisIssue(
|
| 68 |
+
title="Cyclomatic complexity is elevated",
|
| 69 |
+
category="maintainability",
|
| 70 |
+
severity="medium",
|
| 71 |
+
description="Branch-heavy code is harder to review, test, and optimize confidently.",
|
| 72 |
+
)
|
| 73 |
+
)
|
| 74 |
+
if parsed.get("docstring_ratio", 0.0) == 0.0 and parsed.get("function_names"):
|
| 75 |
+
issues.append(
|
| 76 |
+
AnalysisIssue(
|
| 77 |
+
title="Missing public-function documentation",
|
| 78 |
+
category="style",
|
| 79 |
+
severity="low",
|
| 80 |
+
description="Short docstrings would make the expected contract and edge cases easier to review.",
|
| 81 |
+
)
|
| 82 |
+
)
|
| 83 |
+
return issues
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class AnalysisService:
|
| 87 |
+
"""End-to-end analysis pipeline shared by API and UI."""
|
| 88 |
+
|
| 89 |
+
def __init__(self) -> None:
|
| 90 |
+
self._model: PyTorchCodeAnalyzerModel | None = None
|
| 91 |
+
self.reward_service = RewardService()
|
| 92 |
+
self.suggestion_service = SuggestionService()
|
| 93 |
+
self._analyzers: dict[str, Callable[[str, dict[str, Any], dict[str, Any]], DomainAnalysis]] = {
|
| 94 |
+
"dsa": analyze_dsa_code,
|
| 95 |
+
"data_science": analyze_data_science_code,
|
| 96 |
+
"ml_dl": analyze_ml_code,
|
| 97 |
+
"web": analyze_web_code,
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
@property
|
| 101 |
+
def model(self) -> PyTorchCodeAnalyzerModel:
|
| 102 |
+
if self._model is None:
|
| 103 |
+
self._model = PyTorchCodeAnalyzerModel()
|
| 104 |
+
return self._model
|
| 105 |
+
|
| 106 |
+
def _heuristic_domain_scores(self, parsed: dict[str, Any], code: str) -> dict[str, float]:
|
| 107 |
+
"""Derive domain priors from imports and syntax-level hints."""
|
| 108 |
+
|
| 109 |
+
scores = {
|
| 110 |
+
"dsa": 0.22
|
| 111 |
+
+ (0.18 if parsed.get("uses_recursion") else 0.0)
|
| 112 |
+
+ (0.18 if int(parsed.get("max_loop_depth", 0) or 0) >= 1 else 0.0),
|
| 113 |
+
"data_science": 0.22 + (0.38 if parsed.get("uses_pandas") or parsed.get("uses_numpy") else 0.0),
|
| 114 |
+
"ml_dl": 0.22 + (0.38 if parsed.get("uses_torch") or parsed.get("uses_sklearn") else 0.0),
|
| 115 |
+
"web": 0.22
|
| 116 |
+
+ (0.38 if parsed.get("uses_fastapi") or parsed.get("uses_flask") else 0.0)
|
| 117 |
+
+ (0.12 if parsed.get("route_decorators") else 0.0),
|
| 118 |
+
"general": 0.26,
|
| 119 |
+
}
|
| 120 |
+
lowered = code.lower()
|
| 121 |
+
if "fastapi" in lowered:
|
| 122 |
+
scores["web"] += 0.12
|
| 123 |
+
if "pandas" in lowered or "numpy" in lowered:
|
| 124 |
+
scores["data_science"] += 0.1
|
| 125 |
+
if "torch" in lowered or "sklearn" in lowered:
|
| 126 |
+
scores["ml_dl"] += 0.1
|
| 127 |
+
if "while" in code or "for" in code:
|
| 128 |
+
scores["dsa"] += 0.06
|
| 129 |
+
return {key: round(min(value, 0.99), 4) for key, value in scores.items()}
|
| 130 |
+
|
| 131 |
+
def _general_domain_analysis(self, parsed: dict[str, Any], complexity: dict[str, Any]) -> DomainAnalysis:
|
| 132 |
+
"""Fallback analysis when no specialized domain is strongly selected."""
|
| 133 |
+
|
| 134 |
+
suggestions = [
|
| 135 |
+
"Keep functions small, validate inputs explicitly, and add focused tests for edge cases.",
|
| 136 |
+
]
|
| 137 |
+
if int(parsed.get("max_loop_depth", 0) or 0) >= 2:
|
| 138 |
+
suggestions.append("Consider replacing repeated scans with a precomputed dictionary or set.")
|
| 139 |
+
return DomainAnalysis(
|
| 140 |
+
domain="general",
|
| 141 |
+
domain_score=round(_clamp_unit(0.62 - (0.12 * float(complexity["complexity_penalty"]))), 4),
|
| 142 |
+
issues=_static_issues(parsed, complexity)[:2],
|
| 143 |
+
suggestions=suggestions,
|
| 144 |
+
highlights={
|
| 145 |
+
"cyclomatic_complexity": float(complexity["cyclomatic_complexity"]),
|
| 146 |
+
"max_loop_depth": float(parsed.get("max_loop_depth", 0) or 0),
|
| 147 |
+
"lint_score": float(_lint_score(parsed)),
|
| 148 |
+
},
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
def analyze(self, request: AnalyzeCodeRequest) -> AnalyzeCodeResponse:
|
| 152 |
+
"""Run the complete static-plus-ML code review pipeline."""
|
| 153 |
+
|
| 154 |
+
started = time.perf_counter()
|
| 155 |
+
parsed = parse_code_structure(request.code)
|
| 156 |
+
complexity = estimate_complexity(parsed, request.code)
|
| 157 |
+
lint_score = _lint_score(parsed)
|
| 158 |
+
model_prediction = self.model.predict(
|
| 159 |
+
request.code,
|
| 160 |
+
request.context_window,
|
| 161 |
+
request.traceback_text,
|
| 162 |
+
parsed,
|
| 163 |
+
)
|
| 164 |
+
heuristic_scores = self._heuristic_domain_scores(parsed, request.code)
|
| 165 |
+
|
| 166 |
+
combined_scores: dict[str, float] = {}
|
| 167 |
+
for domain, heuristic_score in heuristic_scores.items():
|
| 168 |
+
model_score = float(model_prediction["domain_scores"].get(domain, 0.2))
|
| 169 |
+
combined_scores[domain] = round((0.65 * model_score) + (0.35 * heuristic_score), 4)
|
| 170 |
+
|
| 171 |
+
detected_domain = request.domain_hint if request.domain_hint != "auto" else max(combined_scores, key=combined_scores.get)
|
| 172 |
+
analyzer = self._analyzers.get(detected_domain)
|
| 173 |
+
domain_analysis = (
|
| 174 |
+
analyzer(request.code, parsed, complexity)
|
| 175 |
+
if analyzer is not None
|
| 176 |
+
else self._general_domain_analysis(parsed, complexity)
|
| 177 |
+
)
|
| 178 |
+
static_issues = _static_issues(parsed, complexity)
|
| 179 |
+
static_analysis = StaticAnalysisSummary(
|
| 180 |
+
syntax_valid=bool(parsed["syntax_valid"]),
|
| 181 |
+
syntax_error=str(parsed["syntax_error"]),
|
| 182 |
+
cyclomatic_complexity=int(complexity["cyclomatic_complexity"]),
|
| 183 |
+
line_count=int(parsed["line_count"]),
|
| 184 |
+
max_nesting_depth=int(parsed["max_nesting_depth"]),
|
| 185 |
+
max_loop_depth=int(parsed["max_loop_depth"]),
|
| 186 |
+
time_complexity=str(complexity["time_complexity"]),
|
| 187 |
+
space_complexity=str(complexity["space_complexity"]),
|
| 188 |
+
lint_score=lint_score,
|
| 189 |
+
docstring_coverage=float(parsed["docstring_ratio"]),
|
| 190 |
+
detected_imports=list(parsed["imports"]),
|
| 191 |
+
code_smells=list(parsed["code_smells"]),
|
| 192 |
+
issues=static_issues,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
score_breakdown = self.reward_service.compute(
|
| 196 |
+
ml_score=float(model_prediction["ml_quality_score"]),
|
| 197 |
+
domain_score=domain_analysis.domain_score,
|
| 198 |
+
lint_score=lint_score,
|
| 199 |
+
complexity_penalty=float(complexity["complexity_penalty"]),
|
| 200 |
+
maintainability_score=float(model_prediction["maintainability_score"]),
|
| 201 |
+
issue_probabilities=dict(model_prediction["issue_probabilities"]),
|
| 202 |
+
)
|
| 203 |
+
suggestions = self.suggestion_service.build_suggestions(
|
| 204 |
+
domain_analysis=domain_analysis,
|
| 205 |
+
static_analysis=static_analysis,
|
| 206 |
+
)
|
| 207 |
+
improvement_plan = self.suggestion_service.build_improvement_plan(
|
| 208 |
+
domain_analysis=domain_analysis,
|
| 209 |
+
static_analysis=static_analysis,
|
| 210 |
+
)
|
| 211 |
+
auto_fix_preview = self.suggestion_service.build_auto_fix_preview(
|
| 212 |
+
domain_analysis=domain_analysis,
|
| 213 |
+
static_analysis=static_analysis,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
summary = (
|
| 217 |
+
f"Reviewed Python code as `{detected_domain}` with an ML quality score of {score_breakdown.ml_score:.0%}, "
|
| 218 |
+
f"lint score {score_breakdown.lint_score:.0%}, and RL-ready reward {score_breakdown.reward:.0%}."
|
| 219 |
+
)
|
| 220 |
+
model_notes = list(model_prediction["notes"])
|
| 221 |
+
if static_issues:
|
| 222 |
+
model_notes.append(f"Static analyzer found {len(static_issues)} review issue(s).")
|
| 223 |
+
|
| 224 |
+
return AnalyzeCodeResponse(
|
| 225 |
+
detected_domain=detected_domain, # type: ignore[arg-type]
|
| 226 |
+
domain_confidences=combined_scores,
|
| 227 |
+
score_breakdown=score_breakdown,
|
| 228 |
+
static_analysis=static_analysis,
|
| 229 |
+
model_prediction=ModelPrediction(
|
| 230 |
+
quality_label=str(model_prediction["quality_label"]), # type: ignore[arg-type]
|
| 231 |
+
quality_score=float(model_prediction["quality_score"]),
|
| 232 |
+
maintainability_score=float(model_prediction["maintainability_score"]),
|
| 233 |
+
issue_probabilities=dict(model_prediction["issue_probabilities"]),
|
| 234 |
+
notes=model_notes,
|
| 235 |
+
),
|
| 236 |
+
domain_analysis=domain_analysis,
|
| 237 |
+
suggestions=suggestions if request.enable_suggestions else [],
|
| 238 |
+
improvement_plan=improvement_plan if request.enable_suggestions else [],
|
| 239 |
+
auto_fix_preview=auto_fix_preview if request.enable_suggestions else [],
|
| 240 |
+
score_visualization={
|
| 241 |
+
"reward": score_breakdown.reward,
|
| 242 |
+
"ml_quality": score_breakdown.ml_score,
|
| 243 |
+
"lint_score": score_breakdown.lint_score,
|
| 244 |
+
"maintainability": score_breakdown.maintainability_score,
|
| 245 |
+
"security": score_breakdown.security_score,
|
| 246 |
+
"readability": score_breakdown.readability_score,
|
| 247 |
+
"quality_signal": score_breakdown.quality_signal,
|
| 248 |
+
"error_reduction_signal": score_breakdown.error_reduction_signal,
|
| 249 |
+
"completion_signal": score_breakdown.completion_signal,
|
| 250 |
+
"complexity_penalty": score_breakdown.complexity_penalty,
|
| 251 |
+
},
|
| 252 |
+
model_backend=str(model_prediction["backend_name"]),
|
| 253 |
+
model_id=str(model_prediction["model_id"]),
|
| 254 |
+
summary=summary,
|
| 255 |
+
context_window=request.context_window,
|
| 256 |
+
filename=request.filename,
|
| 257 |
+
analysis_time_ms=round((time.perf_counter() - started) * 1000.0, 2),
|
| 258 |
+
)
|
services/reward_service.py
CHANGED
|
@@ -1,38 +1,56 @@
|
|
| 1 |
-
"""Reward shaping logic for RL-ready code analysis scores."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from schemas.response import ScoreBreakdown
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
)
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward shaping logic for RL-ready code analysis scores."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from schemas.response import ScoreBreakdown
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _clamp_unit(value: float) -> float:
|
| 9 |
+
return max(0.0, min(1.0, float(value)))
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class RewardService:
|
| 13 |
+
"""Compute reward scores from model, lint, complexity, and issue-risk signals."""
|
| 14 |
+
|
| 15 |
+
def compute(
|
| 16 |
+
self,
|
| 17 |
+
*,
|
| 18 |
+
ml_score: float,
|
| 19 |
+
domain_score: float,
|
| 20 |
+
lint_score: float,
|
| 21 |
+
complexity_penalty: float,
|
| 22 |
+
maintainability_score: float,
|
| 23 |
+
issue_probabilities: dict[str, float],
|
| 24 |
+
) -> ScoreBreakdown:
|
| 25 |
+
"""Apply RL-friendly reward shaping to the code review analysis signals."""
|
| 26 |
+
|
| 27 |
+
security_score = _clamp_unit(1.0 - issue_probabilities.get("security", 0.0))
|
| 28 |
+
readability_score = _clamp_unit((0.6 * lint_score) + (0.4 * maintainability_score))
|
| 29 |
+
quality_signal = _clamp_unit((0.55 * ml_score) + (0.25 * maintainability_score) + (0.20 * domain_score))
|
| 30 |
+
error_reduction_signal = _clamp_unit((0.7 * lint_score) + (0.3 * (1.0 - complexity_penalty)))
|
| 31 |
+
completion_signal = _clamp_unit(
|
| 32 |
+
(0.4 * quality_signal) + (0.25 * readability_score) + (0.2 * security_score) + (0.15 * domain_score)
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
reward = _clamp_unit(
|
| 36 |
+
(0.5 * ml_score)
|
| 37 |
+
+ (0.18 * lint_score)
|
| 38 |
+
+ (0.12 * maintainability_score)
|
| 39 |
+
+ (0.10 * domain_score)
|
| 40 |
+
+ (0.10 * security_score)
|
| 41 |
+
- (0.20 * complexity_penalty)
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
return ScoreBreakdown(
|
| 45 |
+
ml_score=round(ml_score, 4),
|
| 46 |
+
domain_score=round(domain_score, 4),
|
| 47 |
+
lint_score=round(lint_score, 4),
|
| 48 |
+
complexity_penalty=round(complexity_penalty, 4),
|
| 49 |
+
maintainability_score=round(maintainability_score, 4),
|
| 50 |
+
security_score=round(security_score, 4),
|
| 51 |
+
readability_score=round(readability_score, 4),
|
| 52 |
+
quality_signal=round(quality_signal, 4),
|
| 53 |
+
error_reduction_signal=round(error_reduction_signal, 4),
|
| 54 |
+
completion_signal=round(completion_signal, 4),
|
| 55 |
+
reward=round(reward, 4),
|
| 56 |
+
)
|
services/suggestion_service.py
CHANGED
|
@@ -1,28 +1,113 @@
|
|
| 1 |
-
"""Suggestion and improvement-plan generation for analyzed code."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from schemas.response import DomainAnalysis, StaticAnalysisSummary
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class SuggestionService:
|
| 9 |
-
"""Build high-signal improvement
|
| 10 |
-
|
| 11 |
-
def
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Suggestion and improvement-plan generation for analyzed code."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from schemas.response import DomainAnalysis, StaticAnalysisSummary, SuggestionItem
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SuggestionService:
|
| 9 |
+
"""Build high-signal improvement suggestions from analysis output."""
|
| 10 |
+
|
| 11 |
+
def build_suggestions(
|
| 12 |
+
self,
|
| 13 |
+
*,
|
| 14 |
+
domain_analysis: DomainAnalysis,
|
| 15 |
+
static_analysis: StaticAnalysisSummary,
|
| 16 |
+
) -> list[SuggestionItem]:
|
| 17 |
+
"""Return prioritized fixes tailored to the detected review signals."""
|
| 18 |
+
|
| 19 |
+
suggestions: list[SuggestionItem] = []
|
| 20 |
+
|
| 21 |
+
if not static_analysis.syntax_valid:
|
| 22 |
+
suggestions.append(
|
| 23 |
+
SuggestionItem(
|
| 24 |
+
priority="P0",
|
| 25 |
+
title="Fix the syntax error",
|
| 26 |
+
rationale="Static parsing failed, so downstream tests and model signals are less reliable.",
|
| 27 |
+
action=f"Resolve the parser issue first: {static_analysis.syntax_error}.",
|
| 28 |
+
category="correctness",
|
| 29 |
+
)
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
if static_analysis.cyclomatic_complexity >= 6 or static_analysis.max_loop_depth >= 2:
|
| 33 |
+
suggestions.append(
|
| 34 |
+
SuggestionItem(
|
| 35 |
+
priority="P1",
|
| 36 |
+
title="Reduce branching or nested loops",
|
| 37 |
+
rationale="Higher structural complexity makes bugs more likely and lowers the RL reward.",
|
| 38 |
+
action="Extract helper functions or replace repeated scans with a dictionary, set, Counter, or vectorized operation.",
|
| 39 |
+
category="performance",
|
| 40 |
+
)
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
if static_analysis.docstring_coverage == 0 and static_analysis.line_count > 0:
|
| 44 |
+
suggestions.append(
|
| 45 |
+
SuggestionItem(
|
| 46 |
+
priority="P2",
|
| 47 |
+
title="Add function-level documentation",
|
| 48 |
+
rationale="Docstrings improve review speed and make behavior clearer for future edits.",
|
| 49 |
+
action="Document the expected inputs, outputs, and edge cases in a short function docstring.",
|
| 50 |
+
category="style",
|
| 51 |
+
)
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
for issue in domain_analysis.issues[:2]:
|
| 55 |
+
suggestions.append(
|
| 56 |
+
SuggestionItem(
|
| 57 |
+
priority="P1" if issue.severity != "high" else "P0",
|
| 58 |
+
title=issue.title,
|
| 59 |
+
rationale=issue.description,
|
| 60 |
+
action=domain_analysis.suggestions[0] if domain_analysis.suggestions else "Refactor the risky section and re-run analysis.",
|
| 61 |
+
category=issue.category,
|
| 62 |
+
)
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
if not suggestions:
|
| 66 |
+
suggestions.append(
|
| 67 |
+
SuggestionItem(
|
| 68 |
+
priority="P2",
|
| 69 |
+
title="Strengthen review confidence",
|
| 70 |
+
rationale="No severe issues were detected, but explicit edge-case coverage still improves maintainability.",
|
| 71 |
+
action="Add targeted tests for empty input, boundary values, and malformed payloads.",
|
| 72 |
+
category="maintainability",
|
| 73 |
+
)
|
| 74 |
+
)
|
| 75 |
+
return suggestions[:4]
|
| 76 |
+
|
| 77 |
+
def build_improvement_plan(self, *, domain_analysis: DomainAnalysis, static_analysis: StaticAnalysisSummary) -> list[str]:
|
| 78 |
+
"""Return a compact three-step plan optimized for developer action."""
|
| 79 |
+
|
| 80 |
+
primary_issue = (
|
| 81 |
+
domain_analysis.issues[0].description
|
| 82 |
+
if domain_analysis.issues
|
| 83 |
+
else "Stabilize correctness first and keep the public behavior explicit."
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
step_one = f"Step 1 - Correctness and safety: {primary_issue}"
|
| 87 |
+
step_two = "Step 2 - Edge cases: test empty inputs, boundary values, malformed payloads, and failure-mode behavior explicitly."
|
| 88 |
+
step_three = "Step 3 - Scalability: reduce repeated scans, lower cyclomatic complexity, and benchmark the path on realistic input sizes."
|
| 89 |
+
|
| 90 |
+
if domain_analysis.suggestions:
|
| 91 |
+
step_three = f"{step_three} Priority hint: {domain_analysis.suggestions[0]}"
|
| 92 |
+
if not static_analysis.syntax_valid:
|
| 93 |
+
step_one = f"Step 1 - Correctness and safety: fix the syntax error first ({static_analysis.syntax_error})."
|
| 94 |
+
return [step_one, step_two, step_three]
|
| 95 |
+
|
| 96 |
+
def build_auto_fix_preview(
|
| 97 |
+
self,
|
| 98 |
+
*,
|
| 99 |
+
domain_analysis: DomainAnalysis,
|
| 100 |
+
static_analysis: StaticAnalysisSummary,
|
| 101 |
+
) -> list[str]:
|
| 102 |
+
"""Generate compact auto-fix hints for the UI preview panel."""
|
| 103 |
+
|
| 104 |
+
preview: list[str] = []
|
| 105 |
+
if not static_analysis.syntax_valid:
|
| 106 |
+
preview.append(f"Repair parser failure: {static_analysis.syntax_error}")
|
| 107 |
+
if static_analysis.max_loop_depth >= 2:
|
| 108 |
+
preview.append("Replace nested scans with a precomputed lookup table or aggregation structure.")
|
| 109 |
+
if static_analysis.docstring_coverage == 0:
|
| 110 |
+
preview.append("Add a short docstring describing the function contract and edge cases.")
|
| 111 |
+
if domain_analysis.suggestions:
|
| 112 |
+
preview.append(domain_analysis.suggestions[0])
|
| 113 |
+
return preview[:3]
|
tests/test_inference_runner.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
"""Smoke tests for the strict inference output contract."""
|
| 2 |
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from dataclasses import dataclass, field
|
| 6 |
-
|
| 7 |
-
from app.env.runner import InferenceRunner
|
| 8 |
-
from app.models.inference import AgentDecision, InferenceConfig
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
@dataclass
|
|
@@ -56,6 +57,17 @@ class _FakeAgent:
|
|
| 56 |
return AgentDecision(action_type="submit_solution")
|
| 57 |
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def test_inference_runner_emits_strict_lines(capsys) -> None:
|
| 60 |
runner = InferenceRunner(InferenceConfig.from_env())
|
| 61 |
runner.agent = _FakeAgent()
|
|
@@ -69,3 +81,38 @@ def test_inference_runner_emits_strict_lines(capsys) -> None:
|
|
| 69 |
"[STEP] step=2 action=submit_solution reward=0.97 done=true error=null",
|
| 70 |
"[END] success=true steps=2 rewards=0.45,0.97",
|
| 71 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Smoke tests for the strict inference output contract."""
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
|
| 7 |
+
from app.env.runner import InferenceRunner
|
| 8 |
+
from app.models.inference import AgentDecision, InferenceConfig
|
| 9 |
+
from app.utils.runtime import format_reward
|
| 10 |
|
| 11 |
|
| 12 |
@dataclass
|
|
|
|
| 57 |
return AgentDecision(action_type="submit_solution")
|
| 58 |
|
| 59 |
|
| 60 |
+
class _LowScoreEnv(_FakeEnv):
|
| 61 |
+
def step_result(self, action: object) -> tuple[_FakeObservation, float, bool, dict[str, object]]:
|
| 62 |
+
self._step += 1
|
| 63 |
+
return (
|
| 64 |
+
_FakeObservation("demo_task", 2, 0.60, True, current_code="candidate"),
|
| 65 |
+
0.60,
|
| 66 |
+
True,
|
| 67 |
+
{"last_action_error": None},
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
def test_inference_runner_emits_strict_lines(capsys) -> None:
|
| 72 |
runner = InferenceRunner(InferenceConfig.from_env())
|
| 73 |
runner.agent = _FakeAgent()
|
|
|
|
| 81 |
"[STEP] step=2 action=submit_solution reward=0.97 done=true error=null",
|
| 82 |
"[END] success=true steps=2 rewards=0.45,0.97",
|
| 83 |
]
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_inference_runner_marks_low_score_submission_unsuccessful(capsys) -> None:
|
| 87 |
+
runner = InferenceRunner(InferenceConfig.from_env())
|
| 88 |
+
runner.agent = _FakeAgent()
|
| 89 |
+
runner._create_env = lambda: _LowScoreEnv() # type: ignore[method-assign]
|
| 90 |
+
runner.run_task("demo_task")
|
| 91 |
+
|
| 92 |
+
captured = capsys.readouterr().out.strip().splitlines()
|
| 93 |
+
assert captured[-1] == "[END] success=false steps=1 rewards=0.60"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def test_inference_config_prefers_openai_key_for_openai_base_url(monkeypatch) -> None:
|
| 97 |
+
monkeypatch.setenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 98 |
+
monkeypatch.setenv("OPENAI_API_KEY", "openai-key")
|
| 99 |
+
monkeypatch.setenv("HF_TOKEN", "hf-key")
|
| 100 |
+
|
| 101 |
+
config = InferenceConfig.from_env()
|
| 102 |
+
|
| 103 |
+
assert config.api_key == "openai-key"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_inference_config_prefers_hf_key_for_hf_router(monkeypatch) -> None:
|
| 107 |
+
monkeypatch.setenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 108 |
+
monkeypatch.setenv("OPENAI_API_KEY", "openai-key")
|
| 109 |
+
monkeypatch.setenv("HF_TOKEN", "hf-key")
|
| 110 |
+
|
| 111 |
+
config = InferenceConfig.from_env()
|
| 112 |
+
|
| 113 |
+
assert config.api_key == "hf-key"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def test_reward_formatting_stays_in_strict_two_decimal_interval() -> None:
|
| 117 |
+
assert format_reward(0.999999) == "0.99"
|
| 118 |
+
assert format_reward(0.000001) == "0.01"
|
tests/test_scoring.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from graders import grade_task
|
|
|
|
| 4 |
from models import PythonCodeReviewAction
|
| 5 |
from server.env import PythonCodeReviewEnvironment
|
| 6 |
from tasks import list_tasks
|
|
@@ -10,6 +11,16 @@ def assert_open_unit_interval(value: float) -> None:
|
|
| 10 |
assert 0 < value < 1, f"Invalid score: {value}"
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def test_task_grades_stay_strictly_between_zero_and_one() -> None:
|
| 14 |
for task in list_tasks():
|
| 15 |
starter_grade = grade_task(task, task.starter_code, include_hidden=False)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from graders import grade_task
|
| 4 |
+
from graders.shared import component_score, final_score_pipeline, safe_score, shaped_score
|
| 5 |
from models import PythonCodeReviewAction
|
| 6 |
from server.env import PythonCodeReviewEnvironment
|
| 7 |
from tasks import list_tasks
|
|
|
|
| 11 |
assert 0 < value < 1, f"Invalid score: {value}"
|
| 12 |
|
| 13 |
|
| 14 |
+
def test_score_helpers_clamp_extremes_into_open_interval() -> None:
|
| 15 |
+
for value in (0.0, 1.0, -999999.0, 999999.0):
|
| 16 |
+
assert_open_unit_interval(safe_score(value))
|
| 17 |
+
assert_open_unit_interval(final_score_pipeline(value))
|
| 18 |
+
|
| 19 |
+
for progress in (0.0, 0.5, 1.0):
|
| 20 |
+
assert_open_unit_interval(shaped_score(progress))
|
| 21 |
+
assert_open_unit_interval(component_score(progress))
|
| 22 |
+
|
| 23 |
+
|
| 24 |
def test_task_grades_stay_strictly_between_zero_and_one() -> None:
|
| 25 |
for task in list_tasks():
|
| 26 |
starter_grade = grade_task(task, task.starter_code, include_hidden=False)
|
utils/ast_parser.py
CHANGED
|
@@ -1,144 +1,248 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import ast
|
| 6 |
-
from
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
if isinstance(node, ast.
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AST-based parsing helpers for Python code review."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass(slots=True)
class _StructureVisitor(ast.NodeVisitor):
    """Collect lightweight structural signals from Python source.

    The visitor walks one parsed module and accumulates counters that the
    downstream review heuristics consume: top-level imports, loop/nesting
    depth, branch counts, recursion, docstring coverage, and a few
    framework-specific call patterns (PyTorch backward/optimizer steps,
    HTTP route decorators).
    """

    # Top-level package name of every import (e.g. "numpy" from "numpy.linalg").
    imports: set[str] = field(default_factory=set)
    # HTTP verb decorator names (get/post/put/patch/delete) seen on functions.
    route_decorators: set[str] = field(default_factory=set)
    # All function/async-function names, in definition order (may repeat).
    function_names: list[str] = field(default_factory=list)
    # All class names, in definition order.
    class_names: list[str] = field(default_factory=list)
    # Free-form smell strings (currently only appended to by callers).
    code_smells: list[str] = field(default_factory=list)
    # Count of decision points: if/try and every loop (incl. comprehensions).
    branch_count: int = 0
    # Deepest simultaneous loop nesting observed anywhere in the module.
    max_loop_depth: int = 0
    # Deepest structural nesting (loops, if, try, with) observed.
    max_nesting_depth: int = 0
    # Live counters used while descending; restored on the way back up.
    current_loop_depth: int = 0
    current_nesting_depth: int = 0
    # Functions that call themselves by their own (direct) name.
    recursive_functions: set[str] = field(default_factory=set)
    # Name of the function currently being visited, for recursion detection.
    current_function: str | None = None
    # Docstring coverage: total function defs vs. defs with a docstring.
    docstring_total: int = 0
    docstring_with_docs: int = 0
    # Calls ending in ".backward" / "backward" (PyTorch training signal).
    backward_calls: int = 0
    # ".step" calls whose dotted path mentions "optimizer".
    optimizer_step_calls: int = 0
    # Direct calls to the builtin container constructors list/dict/set/tuple.
    container_builds: int = 0

    def visit_Import(self, node: ast.Import) -> None:  # noqa: N802
        """Record the top-level package of each `import pkg[.sub]` alias."""
        for alias in node.names:
            self.imports.add(alias.name.split(".")[0])
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:  # noqa: N802
        """Record the top-level package of `from pkg[.sub] import ...`.

        NOTE(review): relative imports (`from . import x`) have module=None
        and are deliberately skipped here.
        """
        if node.module:
            self.imports.add(node.module.split(".")[0])
        self.generic_visit(node)

    def _push_nesting(self) -> None:
        """Enter one structural nesting level and track the high-water mark."""
        self.current_nesting_depth += 1
        self.max_nesting_depth = max(self.max_nesting_depth, self.current_nesting_depth)

    def _pop_nesting(self) -> None:
        """Leave one structural nesting level (never going below zero)."""
        self.current_nesting_depth = max(0, self.current_nesting_depth - 1)

    def _visit_loop(self, node: ast.AST) -> None:
        """Shared bookkeeping for all loop-like nodes.

        Counts the loop as a branch, tracks both loop depth and structural
        nesting depth around the recursive descent, then restores the
        counters so sibling nodes are measured independently.
        """
        self.branch_count += 1
        self.current_loop_depth += 1
        self.max_loop_depth = max(self.max_loop_depth, self.current_loop_depth)
        self._push_nesting()
        self.generic_visit(node)
        self._pop_nesting()
        self.current_loop_depth = max(0, self.current_loop_depth - 1)

    def visit_For(self, node: ast.For) -> None:  # noqa: N802
        """Treat `for` as a loop for depth/branch accounting."""
        self._visit_loop(node)

    def visit_AsyncFor(self, node: ast.AsyncFor) -> None:  # noqa: N802
        """Treat `async for` the same as a synchronous loop."""
        self._visit_loop(node)

    def visit_While(self, node: ast.While) -> None:  # noqa: N802
        """Treat `while` as a loop for depth/branch accounting."""
        self._visit_loop(node)

    def visit_If(self, node: ast.If) -> None:  # noqa: N802
        """Count `if` as a branch and one structural nesting level."""
        self.branch_count += 1
        self._push_nesting()
        self.generic_visit(node)
        self._pop_nesting()

    def visit_Try(self, node: ast.Try) -> None:  # noqa: N802
        """Count `try` as a branch and one structural nesting level."""
        self.branch_count += 1
        self._push_nesting()
        self.generic_visit(node)
        self._pop_nesting()

    def visit_With(self, node: ast.With) -> None:  # noqa: N802
        """`with` adds nesting but is not counted as a branch."""
        self._push_nesting()
        self.generic_visit(node)
        self._pop_nesting()

    def visit_AsyncWith(self, node: ast.AsyncWith) -> None:  # noqa: N802
        """`async with` adds nesting but is not counted as a branch."""
        self._push_nesting()
        self.generic_visit(node)
        self._pop_nesting()

    def visit_comprehension(self, node: ast.comprehension) -> None:  # noqa: N802
        """Each comprehension generator clause counts as one loop."""
        self._visit_loop(node)

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:  # noqa: N802
        """Record the function, its docstring coverage, and route decorators.

        `current_function` is saved and restored around the descent so that
        direct self-calls inside nested functions are attributed correctly.
        """
        self.function_names.append(node.name)
        self.docstring_total += 1
        if ast.get_docstring(node):
            self.docstring_with_docs += 1
        prior = self.current_function
        self.current_function = node.name
        for decorator in node.decorator_list:
            decorator_name = self._decorator_name(decorator)
            if decorator_name in {"get", "post", "put", "patch", "delete"}:
                self.route_decorators.add(decorator_name)
        self.generic_visit(node)
        self.current_function = prior

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:  # noqa: N802
        """Async defs share all the bookkeeping of sync defs."""
        self.visit_FunctionDef(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:  # noqa: N802
        """Record the class name and keep walking its body."""
        self.class_names.append(node.name)
        self.generic_visit(node)

    def visit_Call(self, node: ast.Call) -> None:  # noqa: N802
        """Classify calls by their dotted name.

        Detects `.backward()` (any receiver), `optimizer...step()`, direct
        builtin container constructors, and direct recursion (a call whose
        bare name equals the enclosing function's name).
        """
        dotted_name = self._call_name(node.func)
        if dotted_name.endswith(".backward") or dotted_name == "backward":
            self.backward_calls += 1
        if dotted_name.endswith(".step") or dotted_name == "step":
            # Only count step() when the receiver path mentions "optimizer",
            # to avoid e.g. scheduler.step() — a deliberate heuristic.
            if "optimizer" in dotted_name:
                self.optimizer_step_calls += 1
        if dotted_name in {"list", "dict", "set", "tuple"}:
            self.container_builds += 1
        if self.current_function and dotted_name == self.current_function:
            self.recursive_functions.add(self.current_function)
        self.generic_visit(node)

    @staticmethod
    def _call_name(node: ast.AST) -> str:
        """Flatten a call target to a dotted name ("a.b.c"); "" if opaque.

        Subscript/lambda/other callees collapse to "", which also empties
        the left side of an attribute chain rooted at them.
        """
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            left = _StructureVisitor._call_name(node.value)
            return f"{left}.{node.attr}" if left else node.attr
        return ""

    @staticmethod
    def _decorator_name(node: ast.AST) -> str:
        """Resolve a decorator to its lowercase trailing name.

        Handles bare names, attributes (`app.get`), and calls
        (`app.get("/x")`); anything else yields "".
        """
        if isinstance(node, ast.Call):
            return _StructureVisitor._decorator_name(node.func)
        if isinstance(node, ast.Attribute):
            return node.attr.lower()
        if isinstance(node, ast.Name):
            return node.id.lower()
        return ""
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def _line_smells(lines: list[str]) -> tuple[int, list[int], bool]:
|
| 147 |
+
long_lines = sum(1 for line in lines if len(line) > 88)
|
| 148 |
+
trailing_whitespace_lines = [index + 1 for index, line in enumerate(lines) if line.rstrip() != line]
|
| 149 |
+
tabs_used = any("\t" in line for line in lines)
|
| 150 |
+
return long_lines, trailing_whitespace_lines, tabs_used
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def parse_code_structure(code: str) -> dict[str, Any]:
    """Extract deterministic syntax, import, and structure signals from Python code.

    Args:
        code: Python source text; ``None`` or empty input is treated as an
            empty module.

    Returns:
        A dict of structural signals: syntax validity, line metrics, import
        names, function/class names, loop and nesting depth, framework-usage
        flags, and human-readable code smells. When the source does not
        parse, only the line-level metrics are populated, ``syntax_valid``
        is ``False``, and ``syntax_error`` describes the failure.
    """
    normalized_code = code or ""
    lines = normalized_code.splitlines()
    long_lines, trailing_whitespace_lines, tabs_used = _line_smells(lines)

    # Defaults cover the syntax-error path, where no AST walk happens.
    result: dict[str, Any] = {
        "syntax_valid": True,
        "syntax_error": "",
        "line_count": len(lines),
        "imports": [],
        "function_names": [],
        "class_names": [],
        "long_lines": long_lines,
        "trailing_whitespace_lines": trailing_whitespace_lines,
        "tabs_used": tabs_used,
        "docstring_ratio": 0.0,
        "uses_recursion": False,
        "max_loop_depth": 0,
        "max_nesting_depth": 0,
        "route_decorators": [],
        "code_smells": [],
        "uses_pandas": False,
        "uses_numpy": False,
        "uses_torch": False,
        "uses_sklearn": False,
        "uses_fastapi": False,
        "uses_flask": False,
        "uses_pydantic": False,
        "calls_backward": False,
        "calls_optimizer_step": False,
        "branch_count": 0,
        "container_builds": 0,
    }

    try:
        tree = ast.parse(normalized_code or "\n")
    except SyntaxError as exc:
        result["syntax_valid"] = False
        result["syntax_error"] = f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
        result["code_smells"] = ["Code does not parse.", "Fix syntax before deeper review."]
        return result

    visitor = _StructureVisitor()
    visitor.visit(tree)

    imports = sorted(visitor.imports)
    # Fix: require the dotted alias ("pd.") rather than the bare substring
    # "pd", which false-positived on common words such as "update"/"pdf".
    # This matches the existing "np." and "torch." checks below.
    uses_pandas = "pandas" in imports or "pd." in normalized_code
    uses_numpy = "numpy" in imports or "np." in normalized_code
    uses_torch = "torch" in imports or "torch." in normalized_code
    uses_sklearn = "sklearn" in imports
    uses_fastapi = "fastapi" in imports
    uses_flask = "flask" in imports
    uses_pydantic = "pydantic" in imports or "BaseModel" in normalized_code

    code_smells = list(visitor.code_smells)
    if visitor.max_loop_depth >= 2:
        code_smells.append("Nested loops may create avoidable performance pressure.")
    if long_lines:
        code_smells.append("Long lines reduce readability and reviewability.")
    if trailing_whitespace_lines:
        code_smells.append("Trailing whitespace suggests style drift.")
    if visitor.docstring_total and visitor.docstring_with_docs == 0:
        code_smells.append("Public functions are missing docstrings.")
    if not visitor.function_names:
        code_smells.append("Encapsulate behavior in functions for testability.")

    result.update(
        {
            "imports": imports,
            "function_names": visitor.function_names,
            "class_names": visitor.class_names,
            # Ratio of defs carrying a docstring; max(..., 1) avoids a
            # ZeroDivisionError on modules with no functions.
            "docstring_ratio": round(
                visitor.docstring_with_docs / max(visitor.docstring_total, 1),
                4,
            ),
            "uses_recursion": bool(visitor.recursive_functions),
            "max_loop_depth": visitor.max_loop_depth,
            "max_nesting_depth": visitor.max_nesting_depth,
            "route_decorators": sorted(visitor.route_decorators),
            "code_smells": code_smells,
            "uses_pandas": uses_pandas,
            "uses_numpy": uses_numpy,
            "uses_torch": uses_torch,
            "uses_sklearn": uses_sklearn,
            "uses_fastapi": uses_fastapi,
            "uses_flask": uses_flask,
            "uses_pydantic": uses_pydantic,
            "calls_backward": visitor.backward_calls > 0,
            "calls_optimizer_step": visitor.optimizer_step_calls > 0,
            "branch_count": visitor.branch_count,
            "container_builds": visitor.container_builds,
        }
    )
    return result
|
utils/complexity.py
CHANGED
|
@@ -1,37 +1,70 @@
|
|
| 1 |
-
"""Complexity heuristics for
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
return
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Complexity heuristics for Python code review."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _clamp_unit(value: float) -> float:
|
| 10 |
+
return max(0.0, min(1.0, float(value)))
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _estimate_time_complexity(loop_depth: int, uses_recursion: bool) -> str:
|
| 14 |
+
if uses_recursion and loop_depth >= 1:
|
| 15 |
+
return "O(n^2)"
|
| 16 |
+
if loop_depth >= 3:
|
| 17 |
+
return "O(n^3)"
|
| 18 |
+
if loop_depth == 2:
|
| 19 |
+
return "O(n^2)"
|
| 20 |
+
if loop_depth == 1:
|
| 21 |
+
return "O(n)"
|
| 22 |
+
if uses_recursion:
|
| 23 |
+
return "O(n)"
|
| 24 |
+
return "O(1)"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _estimate_space_complexity(code: str, uses_recursion: bool) -> str:
|
| 28 |
+
if uses_recursion:
|
| 29 |
+
return "O(n)"
|
| 30 |
+
if any(token in code for token in ("[]", "{}", "set(", "dict(", "list(", "Counter(")):
|
| 31 |
+
return "O(n)"
|
| 32 |
+
return "O(1)"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _cyclomatic_complexity(code: str) -> int:
|
| 36 |
+
try:
|
| 37 |
+
tree = ast.parse(code or "\n")
|
| 38 |
+
except SyntaxError:
|
| 39 |
+
return 1
|
| 40 |
+
decision_points = sum(
|
| 41 |
+
isinstance(node, (ast.If, ast.For, ast.AsyncFor, ast.While, ast.Try, ast.ExceptHandler, ast.Match, ast.BoolOp))
|
| 42 |
+
for node in ast.walk(tree)
|
| 43 |
+
)
|
| 44 |
+
return max(1, 1 + decision_points)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def estimate_complexity(parsed: dict[str, Any], code: str) -> dict[str, Any]:
    """Estimate Python complexity signals from parsed structure plus source text.

    Args:
        parsed: Output of the structure parser; missing keys default to 0/False.
        code: The raw source, used for cyclomatic and space estimates.

    Returns:
        Dict with ``cyclomatic_complexity``, ``time_complexity``,
        ``space_complexity``, and a ``complexity_penalty`` in [0, 1].
    """
    cyclomatic = _cyclomatic_complexity(code)
    loop_depth = int(parsed.get("max_loop_depth", 0) or 0)
    nesting_depth = int(parsed.get("max_nesting_depth", 0) or 0)
    recursive = bool(parsed.get("uses_recursion", False))
    total_lines = int(parsed.get("line_count", 0) or 0)

    # Weighted, capped contributions keep any single signal from
    # dominating the penalty; the result is clamped into [0, 1].
    penalty = _clamp_unit(
        0.08
        + min(cyclomatic, 12) * 0.045
        + min(loop_depth, 4) * 0.11
        + min(nesting_depth, 4) * 0.06
        + (0.06 if recursive else 0.0)
        + min(total_lines, 200) * 0.0009
    )

    return {
        "cyclomatic_complexity": cyclomatic,
        "time_complexity": _estimate_time_complexity(loop_depth, recursive),
        "space_complexity": _estimate_space_complexity(code, recursive),
        "complexity_penalty": round(penalty, 4),
    }
|
uv.lock
CHANGED
|
@@ -1926,7 +1926,6 @@ source = { editable = "." }
|
|
| 1926 |
dependencies = [
|
| 1927 |
{ name = "fastapi" },
|
| 1928 |
{ name = "gradio" },
|
| 1929 |
-
{ name = "hf-xet" },
|
| 1930 |
{ name = "openai" },
|
| 1931 |
{ name = "openenv-core", extra = ["core"] },
|
| 1932 |
{ name = "streamlit" },
|
|
@@ -1945,7 +1944,6 @@ dev = [
|
|
| 1945 |
requires-dist = [
|
| 1946 |
{ name = "fastapi", specifier = ">=0.111.0" },
|
| 1947 |
{ name = "gradio", specifier = ">=5.26.0" },
|
| 1948 |
-
{ name = "hf-xet", specifier = ">=1.4.3" },
|
| 1949 |
{ name = "openai", specifier = ">=1.76.0" },
|
| 1950 |
{ name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
|
| 1951 |
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
|
|
|
|
| 1926 |
dependencies = [
|
| 1927 |
{ name = "fastapi" },
|
| 1928 |
{ name = "gradio" },
|
|
|
|
| 1929 |
{ name = "openai" },
|
| 1930 |
{ name = "openenv-core", extra = ["core"] },
|
| 1931 |
{ name = "streamlit" },
|
|
|
|
| 1944 |
requires-dist = [
|
| 1945 |
{ name = "fastapi", specifier = ">=0.111.0" },
|
| 1946 |
{ name = "gradio", specifier = ">=5.26.0" },
|
|
|
|
| 1947 |
{ name = "openai", specifier = ">=1.76.0" },
|
| 1948 |
{ name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
|
| 1949 |
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
|