| """ |
| Handle submissions to the GuardBench leaderboard. |
| """ |
|
|
| import json |
| import os |
| import tempfile |
| from datetime import datetime |
| from typing import Dict, List, Tuple |
| import shutil |
| import threading |
| import time |
|
|
| from huggingface_hub import HfApi |
| from datasets import load_dataset |
| import subprocess |
|
|
| from src.display.formatting import styled_error, styled_message |
| from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID |
| from src.leaderboard.processor import process_jsonl_submission |
|
|
| try: |
| from circleguardbench.evaluator import Evaluator |
| from circleguardbench.context import GuardbenchContext |
| from circleguardbench.models_config import ModelType |
| GUARDBENCH_AVAILABLE = True |
| except ImportError: |
| GUARDBENCH_AVAILABLE = False |
|
|
|
|
def validate_submission(file_path: str) -> Tuple[bool, str]:
    """
    Check whether a submission file can be parsed into leaderboard entries.

    The file is handed to ``process_jsonl_submission``. The submission counts
    as valid when that call yields a non-empty collection of entries.

    Returns:
        ``(True, "Submission is valid")`` on success, otherwise ``(False, reason)``
        where ``reason`` is either the parser's own message or a description of
        the exception raised while parsing.
    """
    try:
        parsed_entries, parser_message = process_jsonl_submission(file_path)
        # The truthiness test stays inside the try so that a failure while
        # evaluating it is reported the same way as a parsing failure.
        verdict = (True, "Submission is valid") if parsed_entries else (False, parser_message)
    except Exception as exc:
        verdict = (False, f"Error validating submission: {exc}")
    return verdict
|
|
|
|
def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
    """
    Submit a model's evaluation entry to the HuggingFace dataset.

    The entry is uniquely identified by model_name, mode, and version: it is
    uploaded to ``entries/entry_{model}_{mode}_{version}.json`` inside
    ``RESULTS_DATASET_ID``, where slashes and spaces in the model name and
    mode are replaced by underscores and the mode is lower-cased.

    Args:
        entry: JSON-serialisable evaluation entry.
        model_name: Model name as given by the submitter.
        mode: Evaluation mode; converted with ``str()`` before sanitising.
        version: Benchmark version tag used in the file name.

    Returns:
        ``(True, message)`` when the upload succeeded, ``(False, message)``
        with the error description otherwise. No exception is propagated.
    """
    temp_path = None
    try:
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
        entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"

        # delete=False because the file is uploaded by path after it has been
        # closed; it is removed explicitly in the finally clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            # Record the path before dumping so a serialisation failure is
            # cleaned up as well.
            temp_path = temp_file.name
            json.dump(entry, temp_file, indent=2)

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=entry_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
        )
        return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
    except Exception as e:
        return False, f"Error submitting entry to dataset: {e}"
    finally:
        # Always remove the temporary file, including when serialisation or the
        # upload failed. A cleanup failure must not change the reported outcome.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
|
|
|
def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
    """
    Submit updated leaderboard to the HuggingFace dataset.

    The entries are wrapped together with a ``last_updated`` timestamp and the
    version, then uploaded to ``leaderboards/leaderboard_{version}.json``
    inside ``RESULTS_DATASET_ID``.

    Args:
        entries: JSON-serialisable leaderboard entries.
        version: Benchmark version tag used in the file name and payload.

    Returns:
        ``(True, message)`` when the upload succeeded, ``(False, message)``
        with the error description otherwise. No exception is propagated.
    """
    temp_path = None
    try:
        leaderboard_data = {
            "entries": entries,
            "last_updated": datetime.now().isoformat(),
            "version": version
        }

        # delete=False because the file is uploaded by path after it has been
        # closed; it is removed explicitly in the finally clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            # Record the path before dumping so a serialisation failure is
            # cleaned up as well.
            temp_path = temp_file.name
            json.dump(leaderboard_data, temp_file, indent=2)

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"leaderboards/leaderboard_{version}.json",
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Update leaderboard for version {version}"
        )
        return True, "Leaderboard updated successfully"
    except Exception as e:
        return False, f"Error updating leaderboard: {e}"
    finally:
        # Always remove the temporary file, including when serialisation or the
        # upload failed. A cleanup failure must not change the reported outcome.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
|
|
|
def restart_space_after_delay(delay_seconds: int = 2) -> None:
    """
    Schedule a restart of the Hugging Face Space ``REPO_ID`` in the background.

    The call returns immediately. A daemon thread sleeps for ``delay_seconds``
    and then requests the restart through ``HfApi``. If that request fails,
    the error is printed rather than raised.
    """
    def _delayed_restart():
        time.sleep(delay_seconds)
        try:
            HfApi(token=TOKEN).restart_space(repo_id=REPO_ID)
        except Exception as exc:
            print(f"Error restarting space: {exc}")

    # Daemon thread: a pending restart never keeps the process alive.
    threading.Thread(target=_delayed_restart, daemon=True).start()
|
|
|
|
def _remove_quietly(path) -> None:
    """Delete ``path`` if it exists; report filesystem problems instead of raising."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        # Already gone: nothing to clean up.
        pass
    except (OSError, TypeError, ValueError) as cleanup_error:
        # Runs from a ``finally`` clause, so raising here would replace the
        # message that process_submission is about to return.
        print(f"Error removing temporary file {path}: {cleanup_error}")


def _load_hub_entries(api, version) -> List[Dict]:
    """Download and parse every ``entries/*_{version}.json`` file of the results dataset."""
    files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
    entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]

    all_entries = []
    for entry_file in entry_files:
        # One unreadable entry must not block the leaderboard rebuild: it is
        # reported and skipped.
        try:
            entry_path = api.hf_hub_download(
                repo_id=RESULTS_DATASET_ID,
                filename=entry_file,
                repo_type="dataset",
            )
            with open(entry_path, 'r') as f:
                all_entries.append(json.load(f))
        except Exception as e:
            print(f"Error loading entry {entry_file}: {e}")
    return all_entries


def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
    """
    Process a submission to the GuardBench leaderboard.

    Steps, in order:
      1. Validate the JSONL file with ``validate_submission``.
      2. Upload the raw file to ``submissions_{version}/`` in the results dataset.
      3. Copy the file into the local ``results/circleguardbench_public``
         directory and run the circleguardbench ``Evaluator`` on it.
      4. Read the model's entry from the generated ``leaderboard.json``,
         overlay the submission metadata and upload it as an entry.
      5. Rebuild the leaderboard from all entries of this version, upload it,
         and schedule a Space restart.

    The submitted file and its local copy are deleted before returning,
    whatever the outcome.

    Args:
        file_path: Path of the submitted JSONL file.
        metadata: Submission metadata; the keys read here are ``model_name``,
            ``guard_model_type``, ``model_type``, ``mode``, ``base_model``,
            ``revision``, ``precision`` and ``weight_type``.
        version: Benchmark version tag.

    Returns:
        The result of ``styled_message`` on success or ``styled_error`` on
        failure. Exceptions are converted to error messages, not propagated.
    """
    if not GUARDBENCH_AVAILABLE:
        return styled_error("circleguardbench package is not installed. Submission processing is unavailable.")

    bench_name = "circleguardbench_public"
    # Bound before the try block so the cleanup in ``finally`` is well-defined
    # even when we bail out before the local copy's path is known.
    target_file = None

    try:
        is_valid, validation_message = validate_submission(file_path)
        if not is_valid:
            return styled_error(validation_message)

        # The benchmark checkout is expected three directory levels above this file.
        project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        guardbench_dir = os.path.join(project_root, "guard-bench-submodule")
        results_dir = os.path.join(guardbench_dir, "results")
        bench_results_dir = os.path.join(results_dir, bench_name)
        os.makedirs(results_dir, exist_ok=True)

        model_name = metadata.get("model_name", "unknown")
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        guard_model_type = metadata.get("guard_model_type", "unknown")
        target_file = os.path.join(bench_results_dir, f"{model_name_safe}.jsonl")

        # Archive the raw submission before evaluating it.
        api = HfApi(token=TOKEN)
        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=submission_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add raw submission for {model_name}"
        )

        # Make the submission available to the evaluator as
        # <results>/<bench>/<model>.jsonl.
        os.makedirs(bench_results_dir, exist_ok=True)
        shutil.copy2(file_path, target_file)

        try:
            ctx = GuardbenchContext()
            ctx.results_dir = results_dir
            ctx.bench_name = bench_name
            ctx.load_dataset("whitecircle-ai/circleguardbench_public")
            ctx.is_initialized = True

            evaluator = Evaluator(ctx, force=True, using_cached=True)
            evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())

            # The model's entry is read back from leaderboard.json in the
            # benchmark results directory after evaluation.
            leaderboard_path = os.path.join(results_dir, ctx.bench_name, "leaderboard.json")
            with open(leaderboard_path, 'r') as f:
                results_data = json.load(f)
            model_entry = next(
                (entry for entry in results_data.get("entries", [])
                 if entry.get("model_name") == model_name_safe),
                None
            )

            if not model_entry:
                return styled_error("No evaluation results found")

            # Overlay submitter-provided metadata; the original (unsanitised)
            # model name replaces the file-system-safe one.
            # NOTE(review): a missing guard_model_type is stored as "none"
            # here but passed to the evaluator as "unknown" — confirm intent.
            model_entry.update({
                "model_name": metadata.get("model_name"),
                "model_type": metadata.get("model_type"),
                "guard_model_type": str(metadata.get("guard_model_type")).lower(),
                "mode": metadata.get("mode"),
                "base_model": metadata.get("base_model"),
                "revision": metadata.get("revision"),
                "precision": metadata.get("precision"),
                "weight_type": metadata.get("weight_type"),
                "version": version,
                "submission_date": datetime.now().isoformat()
            })

            success, message = submit_entry_to_hub(model_entry, model_name, metadata.get("mode"), version)
            if not success:
                return styled_error(message)

            # Rebuild the leaderboard from every stored entry of this version.
            all_entries = _load_hub_entries(api, version)
            success, message = submit_leaderboard_to_hub(all_entries, version)
            if not success:
                return styled_error(message)

            restart_space_after_delay(5)

            return styled_message("Submission successful! Model evaluated and leaderboard updated.")

        except Exception as eval_error:
            return styled_error(f"Error during evaluation: {eval_error}")

    except Exception as e:
        return styled_error(f"Error processing submission: {e}")
    finally:
        # Clean up each file independently so that a failure on one does not
        # leave the other behind.
        _remove_quietly(file_path)
        _remove_quietly(target_file)
|
|