Spaces:

avichauhan
/

api-debug-env

Running

App Files Files Community

avichauhan committed on 15 days ago

Commit

79ff00b

verified ·

1 Parent(s): 29d5796

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

server/environment.py +24 -12
validate-submission.sh +185 -0

server/environment.py CHANGED Viewed

@@ -356,30 +356,40 @@ class APIDebugEnvironment(Environment):
         client = OpenAI(base_url=api_base, api_key=api_key)
-        error_types = [gt["error_type"] for gt in self.ground_truths]
         prompt = (
-            "Rate this API debugging explanation on a 0.0 to 1.0 scale.\n\n"
-            "Criteria:\n"
-            "- Correctly identifies root cause (0 to 0.4)\n"
-            "- Provides actionable fix guidance (0 to 0.3)\n"
-            "- Includes prevention advice for developers (0 to 0.3)\n\n"
-            f"API: {self.spec['api_name']} {self.spec['endpoint']}\n"
-            f"Errors present: {json.dumps(error_types)}\n"
-            f"Explanation: {explanation}\n\n"
             'Return ONLY a JSON object: {"score": 0.0}'
         )
         response = client.chat.completions.create(
             model=model,
             messages=[{"role": "user", "content": prompt}],
             max_tokens=50,
             temperature=0.0,
         )
         text = response.choices[0].message.content or ""
-        # Parse score from response
-        result = json.loads(text)
-        raw_score = float(result["score"])
         return max(0.0, min(1.0, raw_score))
     def _heuristic_score_explanation(self, explanation: str) -> float:
@@ -392,6 +402,8 @@ class APIDebugEnvironment(Environment):
             "because", "should", "instead", "required", "missing",
             "type", "format", "expected", "invalid", "correct",
             "field", "header", "value", "fix", "error",
         ]
         keyword_hits = sum(1 for k in keywords if k in explanation.lower())
         keyword_score = min(keyword_hits / 6.0, 1.0)

         client = OpenAI(base_url=api_base, api_key=api_key)
+        # Include error type + affected fields so the judge evaluates against actual errors
+        gt_summary = [
+            {"error_type": gt["error_type"], "affected_fields": gt.get("affected_fields", [])}
+            for gt in self.ground_truths
+        ]
         prompt = (
+            "You are grading an AI agent's explanation for debugging a broken API request.\n\n"
+            f"API: {self.spec['api_name']} {self.spec['http_method']} {self.spec['endpoint']}\n"
+            f"Actual errors present:\n{json.dumps(gt_summary, indent=2)}\n\n"
+            f"Agent's explanation:\n{explanation}\n\n"
+            "Score 0.0 to 1.0:\n"
+            "- Root cause: correctly names the error type and affected fields (0 to 0.4)\n"
+            "- Fix guidance: explains the correct remediation (0 to 0.3)\n"
+            "- Developer clarity: actionable and clear for a developer (0 to 0.3)\n\n"
             'Return ONLY a JSON object: {"score": 0.0}'
         )
+        # timeout=10 prevents blocking step() if the judge LLM is slow
         response = client.chat.completions.create(
             model=model,
             messages=[{"role": "user", "content": prompt}],
             max_tokens=50,
             temperature=0.0,
+            timeout=10,
         )
         text = response.choices[0].message.content or ""
+        # Parse score from response - protected so any bad response falls back to heuristic
+        try:
+            result = json.loads(text)
+            raw_score = float(result["score"])
+        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
+            return None
         return max(0.0, min(1.0, raw_score))
     def _heuristic_score_explanation(self, explanation: str) -> float:
             "because", "should", "instead", "required", "missing",
             "type", "format", "expected", "invalid", "correct",
             "field", "header", "value", "fix", "error",
+            "authorization", "authentication", "schema", "endpoint",
+            "method", "body", "payload", "constraint",
         ]
         keyword_hits = sum(1 for k in keywords if k in explanation.lower())
         keyword_score = min(keyword_hits / 6.0, 1.0)

validate-submission.sh ADDED Viewed

	@@ -0,0 +1,185 @@

+#!/usr/bin/env bash
+#
+# validate-submission.sh — OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+#
+# Prerequisites:
+#   - Docker:       https://docs.docker.com/get-docker/
+#   - openenv-core: pip install openenv-core
+#   - curl (usually pre-installed)
+#
+# Run:
+#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+#
+#   Or download and run locally:
+#     chmod +x validate-submission.sh
+#     ./validate-submission.sh <ping_url> [repo_dir]
+#
+# Arguments:
+#   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+#   repo_dir   Path to your repo (default: current directory)
+#
+# Examples:
+#   ./validate-submission.sh https://my-team.hf.space
+#   ./validate-submission.sh https://my-team.hf.space ./my-repo
+#
+set -uo pipefail
+DOCKER_BUILD_TIMEOUT=600
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' BOLD='' NC=''
+fi
+# run_with_timeout SECS CMD [ARGS...]
+# Run CMD and stop it once SECS seconds have elapsed. Uses `timeout` or
+# `gtimeout` when one of them is installed; otherwise CMD is started in the
+# background together with a watcher that kills it after SECS seconds.
+# Returns the status reported by timeout/gtimeout, or in the fallback the
+# status `wait` reports for CMD.
+run_with_timeout() {
+  local secs="$1"; shift
+  if command -v timeout &>/dev/null; then
+    timeout "$secs" "$@"
+  elif command -v gtimeout &>/dev/null; then
+    gtimeout "$secs" "$@"
+  else
+    "$@" &
+    local pid=$!
+    # Detach the watcher from stdout/stderr. When this function is called
+    # inside $(...), a leftover `sleep` that still holds the capture pipe
+    # would keep the command substitution waiting until SECS have passed,
+    # even though CMD finished long before.
+    ( sleep "$secs" && kill "$pid" 2>/dev/null ) >/dev/null 2>&1 &
+    local watcher=$!
+    wait "$pid" 2>/dev/null
+    local rc=$?
+    # CMD is done (or was killed); the watcher is no longer needed.
+    kill "$watcher" 2>/dev/null
+    wait "$watcher" 2>/dev/null
+    return $rc
+  fi
+}
+portable_mktemp() {
+  local prefix="${1:-validate}"
+  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+}
+CLEANUP_FILES=()
+cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+trap cleanup EXIT
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+PING_URL="${PING_URL%/}"
+export PING_URL
+PASS=0
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+# stop_at STEP: print which step validation stopped at, then exit with
+# status 1.
+stop_at() {
+  printf "\n%b%bValidation stopped at %s.%b Fix the above before continuing.\n" \
+    "$RED" "$BOLD" "$1" "$NC"
+  exit 1
+}
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+# Step 1: POST an empty JSON object to <ping_url>/reset and require HTTP 200.
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+# curl's -w "%{http_code}" prints "000" by itself when no HTTP response was
+# received, so a failed curl must overwrite HTTP_CODE instead of appending a
+# second "000" (which would yield "000000" and miss the "000" branch below).
+# stderr is discarded so that it cannot overwrite the response body that -o
+# stores in $CURL_OUTPUT.
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  --max-time 30 "$PING_URL/reset" 2>/dev/null) || HTTP_CODE="000"
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  # hint prints its argument through %b, so a single % is already literal.
+  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+# Step 3: run `openenv validate` from inside the repo directory.
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+if ! command -v openenv &>/dev/null; then
+  fail "openenv command not found"
+  hint "Install it: pip install openenv-core"
+  stop_at "Step 3"
+fi
+# The cd happens inside the command substitution, so the script's own working
+# directory is left unchanged.
+if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
+  VALIDATE_OK=true
+  pass "openenv validate passed"
+  if [ -n "$VALIDATE_OUTPUT" ]; then
+    log "  $VALIDATE_OUTPUT"
+  fi
+else
+  VALIDATE_OK=false
+  fail "openenv validate failed"
+  printf "%s\n" "$VALIDATE_OUTPUT"
+  stop_at "Step 3"
+fi
+# Every step above exits through stop_at on failure, so reaching this point
+# means all three checks passed.
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+exit 0