avichauhan committed on
Commit
79ff00b
·
verified ·
1 Parent(s): 29d5796

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. server/environment.py +24 -12
  2. validate-submission.sh +185 -0
server/environment.py CHANGED
@@ -356,30 +356,40 @@ class APIDebugEnvironment(Environment):
356
 
357
  client = OpenAI(base_url=api_base, api_key=api_key)
358
 
359
- error_types = [gt["error_type"] for gt in self.ground_truths]
 
 
 
 
360
  prompt = (
361
- "Rate this API debugging explanation on a 0.0 to 1.0 scale.\n\n"
362
- "Criteria:\n"
363
- "- Correctly identifies root cause (0 to 0.4)\n"
364
- "- Provides actionable fix guidance (0 to 0.3)\n"
365
- "- Includes prevention advice for developers (0 to 0.3)\n\n"
366
- f"API: {self.spec['api_name']} {self.spec['endpoint']}\n"
367
- f"Errors present: {json.dumps(error_types)}\n"
368
- f"Explanation: {explanation}\n\n"
369
  'Return ONLY a JSON object: {"score": 0.0}'
370
  )
371
 
 
372
  response = client.chat.completions.create(
373
  model=model,
374
  messages=[{"role": "user", "content": prompt}],
375
  max_tokens=50,
376
  temperature=0.0,
 
377
  )
378
  text = response.choices[0].message.content or ""
379
 
380
- # Parse score from response
381
- result = json.loads(text)
382
- raw_score = float(result["score"])
 
 
 
 
383
  return max(0.0, min(1.0, raw_score))
384
 
385
  def _heuristic_score_explanation(self, explanation: str) -> float:
@@ -392,6 +402,8 @@ class APIDebugEnvironment(Environment):
392
  "because", "should", "instead", "required", "missing",
393
  "type", "format", "expected", "invalid", "correct",
394
  "field", "header", "value", "fix", "error",
 
 
395
  ]
396
  keyword_hits = sum(1 for k in keywords if k in explanation.lower())
397
  keyword_score = min(keyword_hits / 6.0, 1.0)
 
356
 
357
  client = OpenAI(base_url=api_base, api_key=api_key)
358
 
359
+ # Include error type + affected fields so the judge evaluates against actual errors
360
+ gt_summary = [
361
+ {"error_type": gt["error_type"], "affected_fields": gt.get("affected_fields", [])}
362
+ for gt in self.ground_truths
363
+ ]
364
  prompt = (
365
+ "You are grading an AI agent's explanation for debugging a broken API request.\n\n"
366
+ f"API: {self.spec['api_name']} {self.spec['http_method']} {self.spec['endpoint']}\n"
367
+ f"Actual errors present:\n{json.dumps(gt_summary, indent=2)}\n\n"
368
+ f"Agent's explanation:\n{explanation}\n\n"
369
+ "Score 0.0 to 1.0:\n"
370
+ "- Root cause: correctly names the error type and affected fields (0 to 0.4)\n"
371
+ "- Fix guidance: explains the correct remediation (0 to 0.3)\n"
372
+ "- Developer clarity: actionable and clear for a developer (0 to 0.3)\n\n"
373
  'Return ONLY a JSON object: {"score": 0.0}'
374
  )
375
 
376
+ # timeout=10 prevents blocking step() if the judge LLM is slow
377
  response = client.chat.completions.create(
378
  model=model,
379
  messages=[{"role": "user", "content": prompt}],
380
  max_tokens=50,
381
  temperature=0.0,
382
+ timeout=10,
383
  )
384
  text = response.choices[0].message.content or ""
385
 
386
+ # Parse score from response - protected so any bad response falls back to heuristic
387
+ try:
388
+ result = json.loads(text)
389
+ raw_score = float(result["score"])
390
+ except (json.JSONDecodeError, KeyError, TypeError, ValueError):
391
+ return None
392
+
393
  return max(0.0, min(1.0, raw_score))
394
 
395
  def _heuristic_score_explanation(self, explanation: str) -> float:
 
402
  "because", "should", "instead", "required", "missing",
403
  "type", "format", "expected", "invalid", "correct",
404
  "field", "header", "value", "fix", "error",
405
+ "authorization", "authentication", "schema", "endpoint",
406
+ "method", "body", "payload", "constraint",
407
  ]
408
  keyword_hits = sum(1 for k in keywords if k in explanation.lower())
409
  keyword_score = min(keyword_hits / 6.0, 1.0)
validate-submission.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
# Treat unset variables as errors and let a pipeline fail when any stage fails.
# Errexit (-e) is not enabled: each step below inspects its own result.
set -uo pipefail

# Time limit, in seconds, applied to the docker build step.
DOCKER_BUILD_TIMEOUT=600

# ANSI colour escapes. They stay empty unless stdout is a terminal, so
# redirected output contains no escape sequences.
RED=''
GREEN=''
YELLOW=''
BOLD=''
NC=''
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
fi
40
+
41
# Run a command under a time limit.
#
# Arguments:
#   $1     time limit in seconds
#   $2...  the command to run, followed by its arguments
#
# Returns the exit status of the command. When the limit is reached the status
# is whatever timeout/gtimeout report, or the status of the killed background
# job when the pure-shell fallback is used.
run_with_timeout() {
  local secs="$1"; shift
  if command -v timeout &>/dev/null; then
    timeout "$secs" "$@"
  elif command -v gtimeout &>/dev/null; then
    gtimeout "$secs" "$@"
  else
    # Neither timeout nor gtimeout exists: run the command in the background
    # and start a watcher that kills it once the limit expires.
    "$@" &
    local pid=$!
    # The watcher's stdout/stderr are sent to /dev/null on purpose. Killing the
    # watcher subshell does not kill its `sleep` child; if that orphan still
    # held the caller's stdout, a caller using $(run_with_timeout ...) would
    # block until the sleep finished even though the command already ended.
    ( sleep "$secs" && kill "$pid" 2>/dev/null ) >/dev/null 2>&1 &
    local watcher=$!
    wait "$pid" 2>/dev/null
    local rc=$?
    # The command is done (or was killed); the watcher is no longer needed.
    kill "$watcher" 2>/dev/null
    wait "$watcher" 2>/dev/null
    return $rc
  fi
}
59
+
60
# Create a temporary file and print its path.
#
# Arguments:
#   $1  optional file-name prefix (defaults to "validate")
#
# The file is created under $TMPDIR, or /tmp when TMPDIR is unset or empty.
# If that templated call fails, a plain `mktemp` is used instead.
portable_mktemp() {
  local tag="${1:-validate}"
  local base="${TMPDIR:-/tmp}"
  mktemp "${base}/${tag}-XXXXXX" 2>/dev/null || mktemp
}
64
+
65
# Paths appended to this array are removed when the script exits.
CLEANUP_FILES=()

# Delete every registered temporary file. The ${arr[@]+...} form is kept so an
# empty array does not trip `set -u`.
cleanup() {
  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
}

# Run the cleanup on any exit path, including the early `exit 1` calls.
trap cleanup EXIT
68
+
69
# Positional arguments: the Space URL (required) and the repository directory
# (optional, defaults to the current directory).
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

# Without a URL there is nothing to validate: print usage and stop.
if [ -z "$PING_URL" ]; then
  printf "Usage: %s <ping_url> [repo_dir]\n\n" "$0"
  printf "%s\n" \
    " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)" \
    " repo_dir Path to your repo (default: current directory)"
  exit 1
fi

# Resolve the repository directory to an absolute path; a failing `cd` means
# the directory does not exist or cannot be entered.
REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)" || {
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
}

# Drop one trailing slash so "$PING_URL/reset" never contains "//".
export PING_URL="${PING_URL%/}"

# Counter incremented by pass().
PASS=0
87
+
88
# Print a message prefixed with the current UTC time. Backslash escapes in the
# message are interpreted (%b), which is how the colour variables take effect.
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf "[%s] %b\n" "$stamp" "$*"
}

# Report a successful check and bump the PASS counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((1 + PASS))
}

# Report a failed check.
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# Print a follow-up suggestion below a failure message.
hint() {
  local advice="$1"
  printf " ${YELLOW}Hint:${NC} %b\n" "$advice"
}

# Announce which step aborted the run and exit with status 1.
stop_at() {
  local step="$1"
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$step"
  exit 1
}
97
+
98
# Opening banner followed by the resolved inputs for this run.
printf "\n${BOLD}========================================${NC}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"
105
+
106
# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

# The response body goes to a temp file that is removed on exit.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")

# curl prints the status code via -w even when the request fails (as "000"),
# so appending another "000" on failure would produce "000000". Instead, any
# curl failure overwrites the captured value with a single "000". stderr is
# discarded rather than written onto the same file that -o is filling.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint() passes this text as a %b argument, not as a printf format, so a
  # single % is what yields a copy-pasteable command.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
127
+
128
# Step 2: build the Docker image from the repo root or from server/.
log "${BOLD}Step 2/3: Running docker build${NC} ..."

command -v docker &>/dev/null || {
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
}

# The first directory that contains a Dockerfile becomes the build context;
# the repo root takes precedence over server/.
DOCKER_CONTEXT=""
for candidate in "$REPO_DIR" "$REPO_DIR/server"; do
  if [ -f "$candidate/Dockerfile" ]; then
    DOCKER_CONTEXT="$candidate"
    break
  fi
done

if [ -z "$DOCKER_CONTEXT" ]; then
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Capture stdout and stderr together so the tail can be shown on failure.
if BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1); then
  BUILD_OK=true
else
  BUILD_OK=false
fi

if [ "$BUILD_OK" != true ]; then
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
pass "Docker build succeeded"
157
+
158
# Step 3: run `openenv validate` from inside the repository.
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

command -v openenv &>/dev/null || {
  fail "openenv command not found"
  hint "Install it: pip install openenv-core"
  stop_at "Step 3"
}

# The cd happens inside the command substitution, so the script's own working
# directory is left untouched.
if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
  VALIDATE_OK=true
else
  VALIDATE_OK=false
fi

if [ "$VALIDATE_OK" != true ]; then
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi

pass "openenv validate passed"
# Echo the validator's output only when it printed something.
if [ -n "$VALIDATE_OUTPUT" ]; then
  log " $VALIDATE_OUTPUT"
fi
177
+
178
# Closing banner: reached only when none of the steps called stop_at.
printf "\n${BOLD}========================================${NC}\n"
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
printf "${BOLD}========================================${NC}\n\n"

exit 0