Spaces:

Somin-Aggarwal
/

AnnotationReviewer

Runtime error

App Files Files Community

Somin-Aggarwal committed on Apr 8

Commit

01a014b

verified ·

1 Parent(s): f84ee46

Upload 3 files

Browse files

Files changed (3) hide show

server/corruption.py +251 -0
server/environment.py +499 -0
server/grader.py +148 -0

server/corruption.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+Annotation corruption strategies for the Annotation QA Environment.
+Takes gold-standard COCO annotations and systematically corrupts them to create
+data with known errors. The corruption is deterministic given a seed.
+Corruption types by difficulty (bounding-box geometry is never altered):
+- Task 1 ("spurious"): adds 2-4 fake boxes with randomly chosen classes
+- Task 2 ("classes"): swaps about 30% of class labels (similar or unrelated class) and adds 1-2 fake boxes
+- Task 3 ("missing"): deletes 15-20% of annotations and relabels about 20% of the remaining ones
+"""
+import copy
+import random
+from typing import Dict, List, Tuple
+# ──────────────────────────────────────────────
+# COCO 80 categories
+# ──────────────────────────────────────────────
+# Full label vocabulary (80 names). Within this module it is the pool that
+# random "wrong" labels and the labels of spurious boxes are drawn from.
+ALL_CLASSES = [
+    "person", "bicycle", "car", "motorcycle", "airplane",
+    "bus", "train", "truck", "boat", "traffic light",
+    "fire hydrant", "stop sign", "parking meter", "bench",
+    "bird", "cat", "dog", "horse", "sheep",
+    "cow", "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie", "suitcase",
+    "frisbee", "skis", "snowboard", "sports ball", "kite",
+    "baseball bat", "baseball glove", "skateboard", "surfboard",
+    "tennis racket", "bottle", "wine glass", "cup",
+    "fork", "knife", "spoon", "bowl", "banana",
+    "apple", "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake", "chair",
+    "couch", "potted plant", "bed", "dining table",
+    "toilet", "tv", "laptop", "mouse", "remote",
+    "keyboard", "cell phone", "microwave", "oven",
+    "toaster", "sink", "refrigerator", "book", "clock",
+    "vase", "scissors", "teddy bear", "hair drier",
+    "toothbrush",
+]
+# Class confusion maps — COCO-specific similar category pairs
+# Keys are true class names; values are the plausible-but-wrong labels that
+# the "wrong_similar_class" corruption picks from. The relation is not
+# required to be symmetric (e.g. "dining table" lists "bed", but "bed" does
+# not list "dining table"). A class with no entry, or an empty list, makes
+# corrupt_annotations fall back to a random different class.
+SIMILAR_CLASSES: Dict[str, List[str]] = {
+    "car": ["truck", "bus"],
+    "truck": ["car", "bus"],
+    "bus": ["truck", "car"],
+    "motorcycle": ["bicycle"],
+    "bicycle": ["motorcycle"],
+    "dog": ["cat", "horse"],
+    "cat": ["dog"],
+    "horse": ["cow", "dog"],
+    "cow": ["horse", "sheep"],
+    "sheep": ["cow"],
+    "elephant": ["bear"],
+    "bear": ["elephant"],
+    "zebra": ["giraffe", "horse"],
+    "giraffe": ["zebra"],
+    "bird": ["airplane", "kite"],
+    "airplane": ["bird", "kite"],
+    "chair": ["couch", "bench"],
+    "couch": ["chair", "bed"],
+    "bed": ["couch"],
+    "bench": ["chair"],
+    "dining table": ["bed"],
+    "bottle": ["cup", "wine glass", "vase"],
+    "cup": ["bottle", "wine glass", "bowl"],
+    "wine glass": ["cup", "bottle"],
+    "bowl": ["cup"],
+    "fork": ["knife", "spoon"],
+    "knife": ["fork", "spoon", "scissors"],
+    "spoon": ["fork", "knife"],
+    "scissors": ["knife"],
+    "banana": ["hot dog"],
+    "hot dog": ["banana", "sandwich"],
+    "pizza": ["cake", "donut"],
+    "donut": ["pizza", "cake", "apple", "orange"],
+    "cake": ["pizza", "donut"],
+    "apple": ["orange", "donut", "sports ball"],
+    "orange": ["apple", "donut", "sports ball"],
+    "sandwich": ["hot dog", "pizza"],
+    "broccoli": ["potted plant"],
+    "carrot": ["banana"],
+    "potted plant": ["broccoli", "vase"],
+    "tv": ["laptop", "microwave"],
+    "laptop": ["tv", "keyboard"],
+    "keyboard": ["laptop", "remote"],
+    "remote": ["cell phone", "keyboard"],
+    "cell phone": ["remote"],
+    "mouse": ["remote"],
+    "microwave": ["oven", "tv"],
+    "oven": ["microwave", "refrigerator"],
+    "toaster": ["microwave"],
+    "refrigerator": ["oven"],
+    "sink": ["toilet", "bowl"],
+    "toilet": ["sink", "chair"],
+    "book": ["laptop", "cell phone"],
+    "clock": ["sports ball"],
+    "vase": ["bottle", "cup"],
+    "backpack": ["suitcase", "handbag"],
+    "handbag": ["backpack", "suitcase"],
+    "suitcase": ["backpack", "handbag"],
+    "umbrella": ["kite"],
+    "tie": ["person"],
+    "frisbee": ["sports ball", "kite"],
+    "sports ball": ["frisbee", "apple", "orange"],
+    "kite": ["bird", "umbrella", "frisbee"],
+    "baseball bat": ["tennis racket", "surfboard"],
+    "baseball glove": ["backpack"],
+    "skateboard": ["surfboard", "snowboard"],
+    "surfboard": ["skateboard", "snowboard"],
+    "snowboard": ["skateboard", "surfboard", "skis"],
+    "skis": ["snowboard"],
+    "teddy bear": ["person", "dog"],
+    "hair drier": ["toothbrush"],
+    "toothbrush": ["hair drier"],
+    "person": ["teddy bear"],
+    "train": ["bus", "truck"],
+    "boat": ["surfboard"],
+    "traffic light": ["fire hydrant", "parking meter", "stop sign"],
+    "fire hydrant": ["traffic light", "parking meter"],
+    "stop sign": ["traffic light", "parking meter"],
+    "parking meter": ["fire hydrant", "stop sign"],
+}
+def generate_spurious_annotation(
+    existing_bboxes: List[List[float]], rng: random.Random
+) -> Dict:
+    """Generate a random annotation that doesn't overlap much with existing ones."""
+    for _ in range(20):  # try up to 20 times
+        w = rng.uniform(0.05, 0.20)
+        h = rng.uniform(0.05, 0.20)
+        x = rng.uniform(0.0, 1.0 - w)
+        y = rng.uniform(0.0, 1.0 - h)
+        bbox = [round(x, 4), round(y, 4), round(w, 4), round(h, 4)]
+        # Check it doesn't overlap too much with existing
+        from .grader import compute_iou
+        max_iou = max(
+            (compute_iou(bbox, eb) for eb in existing_bboxes), default=0.0
+        )
+        if max_iou < 0.3:
+            cls = rng.choice(ALL_CLASSES)
+            return {"bbox": bbox, "class_label": cls}
+    # Fallback: place it anyway
+    return {
+        "bbox": [round(rng.uniform(0.0, 0.8), 4), round(rng.uniform(0.0, 0.8), 4), 0.1, 0.1],
+        "class_label": rng.choice(ALL_CLASSES),
+    }
+def corrupt_annotations(
+    gold_annotations: List[Dict],
+    difficulty: str,
+    seed: int,
+) -> Tuple[List[Dict], List[str]]:
+    """
+    Corrupt gold annotations conceptually (no geometry shifts) based on difficulty level.
+    Difficulties:
+    - "spurious": Adds 2-4 entirely fake boxes.
+    - "classes": Swaps 30% of class labels (similar and different) + adds some spurious.
+    - "missing": Deletes 15-20% of annotations completely. VLM must FLAG_MISSING.
+    """
+    rng = random.Random(seed)
+    corrupted = copy.deepcopy(gold_annotations)
+    log = []
+    if difficulty == "spurious":
+        # Task 1: Spurious removal only
+        existing_bboxes = [a["bbox"] for a in corrupted]
+        n_spurious = rng.randint(2, 4)
+        next_id = max((a["id"] for a in corrupted), default=0) + 1
+        for i in range(n_spurious):
+            spur = generate_spurious_annotation(existing_bboxes, rng)
+            spur["id"] = next_id + i
+            corrupted.append(spur)
+            existing_bboxes.append(spur["bbox"])
+            log.append(f"Added spurious ann {spur['id']} ({spur['class_label']})")
+    elif difficulty == "classes":
+        # Task 2: Fix Classes
+        corruption_rate = 0.30
+        n_corrupt = max(2, int(len(corrupted) * corruption_rate))
+        indices = list(range(len(corrupted)))
+        rng.shuffle(indices)
+        corrupt_indices = indices[:n_corrupt]
+        for idx in corrupt_indices:
+            action = rng.choice(["wrong_similar_class", "wrong_different_class"])
+            ann = corrupted[idx]
+            old_cls = ann["class_label"]
+            if action == "wrong_similar_class":
+                similar = SIMILAR_CLASSES.get(old_cls, [])
+                if similar:
+                    new_cls = rng.choice(similar)
+                    ann["class_label"] = new_cls
+                    log.append(f"Changed ann {ann['id']} class: {old_cls} → {new_cls} (similar)")
+                else:
+                    candidates = [c for c in ALL_CLASSES if c != old_cls]
+                    ann["class_label"] = rng.choice(candidates)
+                    log.append(f"Changed ann {ann['id']} class: {old_cls} → {ann['class_label']} (fallback)")
+            elif action == "wrong_different_class":
+                candidates = [c for c in ALL_CLASSES if c != old_cls]
+                ann["class_label"] = rng.choice(candidates)
+                log.append(f"Changed ann {ann['id']} class: {old_cls} → {ann['class_label']} (different)")
+        # Add 1-2 spurious just to keep them on their toes
+        existing_bboxes = [a["bbox"] for a in corrupted]
+        n_spurious = rng.randint(1, 2)
+        next_id = max((a["id"] for a in corrupted), default=0) + 1
+        for i in range(n_spurious):
+            spur = generate_spurious_annotation(existing_bboxes, rng)
+            spur["id"] = next_id + i
+            corrupted.append(spur)
+            existing_bboxes.append(spur["bbox"])
+            log.append(f"Added spurious ann {spur['id']} ({spur['class_label']})")
+    elif difficulty == "missing":
+        # Task 3: Missing items evaluation
+        # Randomly delete 15-20% of annotations completely
+        delete_rate = rng.uniform(0.15, 0.20)
+        n_delete = max(1, int(len(corrupted) * delete_rate))
+        indices = list(range(len(corrupted)))
+        rng.shuffle(indices)
+        delete_indices = indices[:n_delete]
+        for idx in delete_indices:
+            ann = corrupted[idx]
+            log.append(f"Missing Obj Created: Removed ann {ann['id']} ({ann['class_label']})")
+            corrupted[idx] = None
+        corrupted = [a for a in corrupted if a is not None]
+        # Also add a little bit of class confusion
+        corruption_rate = 0.20
+        n_corrupt = max(1, int(len(corrupted) * corruption_rate))
+        remaining_indices = list(range(len(corrupted)))
+        rng.shuffle(remaining_indices)
+        for idx in remaining_indices[:n_corrupt]:
+            ann = corrupted[idx]
+            old_cls = ann["class_label"]
+            candidates = [c for c in ALL_CLASSES if c != old_cls]
+            ann["class_label"] = rng.choice(candidates)
+            log.append(f"Changed class: {old_cls} -> {ann['class_label']}")
+    return corrupted, log

server/environment.py ADDED Viewed

	@@ -0,0 +1,499 @@

+"""
+Annotation QA Environment — Core Environment Logic.
+Implements the OpenEnv 3-method interface:
+- reset(task_id) → Observation
+- step(action) → Observation
+- state → State
+The agent reviews intentionally-flawed annotations on real COCO val2017 images
+and must correct bounding boxes, fix class labels, add missing annotations,
+or remove spurious ones. Dense reward is provided at every step.
+"""
+import copy
+import json
+import os
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from uuid import uuid4
+try:
+    from openenv.core.env_server.types import Action, Observation, State
+except ImportError:
+    # Fallback for standalone
+    pass
+try:
+    from ..models import (
+        Annotation,
+        AnnotationQAAction,
+        AnnotationQAObservation,
+        AnnotationQAState,
+    )
+except ImportError:
+    from models import (
+        Annotation,
+        AnnotationQAAction,
+        AnnotationQAObservation,
+        AnnotationQAState,
+    )
+from .corruption import ALL_CLASSES, corrupt_annotations
+from .grader import (
+    compute_annotation_quality,
+    compute_step_reward,
+    grade_episode,
+)
+# ──────────────────────────────────────────────
+# Task definitions
+# ──────────────────────────────────────────────
+# Keyed by task id. Per-task fields as used by AnnotationQAEnvironment:
+#   description — text surfaced to the agent in each observation
+#   difficulty  — mode string passed to corrupt_annotations
+#   max_steps   — step budget after which the episode is ended and graded
+#   data_file   — sample file path, resolved relative to the data/tasks directory
+TASK_CONFIGS = {
+    "remove_spurious": {
+        "description": (
+            "Spurious Box Removal Task. Fake bounding boxes have been randomly drawn. "
+            "Identify and remove any annotations that do not strictly bound a real object."
+        ),
+        "difficulty": "spurious",
+        "max_steps": 15,
+        "data_file": "task1_remove_spurious/samples.json",
+    },
+    "fix_classes": {
+        "description": (
+            "Class Identification Task. Some bounding boxes have incorrect class labels, "
+            "and some are completely fake (spurious). Fix class labels using "
+            "CHANGE_CLASS and REMOVE spurious labels."
+        ),
+        "difficulty": "classes",
+        "max_steps": 20,
+        "data_file": "task2_fix_classes/samples.json",
+    },
+    "find_missing": {
+        "description": (
+            "Contextual Object Detection Task. Bounding boxes for key objects have been "
+            "entirely removed from the image. You must meticulously identify what object classes "
+            "are completely missing from the drawn bounding boxes and flag them."
+        ),
+        "difficulty": "missing",
+        "max_steps": 30,
+        "data_file": "task3_find_missing/samples.json",
+    },
+}
+class AnnotationQAEnvironment:
+    """
+    Annotation QA Environment following the OpenEnv pattern.
+    The agent reviews real COCO val2017 image annotations that contain
+    intentional errors and must correct them through a series of actions.
+    A VLM is used to visually inspect the images.
+    """
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self):
+        self._state = AnnotationQAState()
+        self._gold_annotations: List[Dict] = []
+        self._initial_annotations: List[Dict] = []
+        self._current_annotations: List[Dict] = []
+        self._scene_data: Dict[str, Any] = {}
+        self._task_config: Dict[str, Any] = {}
+        self._corrections_made: int = 0
+        self._done: bool = False
+        self._data_cache: Dict[str, Any] = {}
+        self._next_ann_id: int = 0
+        # Load data directory
+        self._data_dir = Path(__file__).parent.parent / "data" / "tasks"
+    def _load_task_data(self, task_id: str) -> List[Dict]:
+        """Load and cache task data from disk."""
+        if task_id in self._data_cache:
+            return self._data_cache[task_id]
+        config = TASK_CONFIGS[task_id]
+        data_file = self._data_dir / config["data_file"]
+        if not data_file.exists():
+            raise FileNotFoundError(
+                f"Task data file not found: {data_file}. "
+                f"Run 'python -m data.prepare_coco' to generate the COCO dataset."
+            )
+        with open(data_file, "r") as f:
+            data = json.load(f)
+        self._data_cache[task_id] = data
+        return data
+    def reset(
+        self,
+        seed: Optional[int] = None,
+        episode_id: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs: Any,
+    ) -> AnnotationQAObservation:
+        """
+        Start a new episode.
+        Args:
+            seed: Random seed for reproducibility
+            episode_id: Optional episode ID
+            task: Task ID — one of "fix_bboxes", "fix_classes", "batch_audit"
+        """
+        task_id = task or kwargs.get("task_id", "remove_spurious")
+        if task_id not in TASK_CONFIGS:
+            task_id = "remove_spurious"
+        self._task_config = TASK_CONFIGS[task_id]
+        data = self._load_task_data(task_id)
+        # Select a random sample
+        rng = random.Random(seed) if seed is not None else random.Random()
+        scene = rng.choice(data)
+        sample_seed = scene.get("seed", rng.randint(0, 99999))
+        # Store gold annotations
+        self._gold_annotations = copy.deepcopy(scene["gold_annotations"])
+        self._scene_data = scene
+        # Create corrupted annotations
+        corrupted, corruption_log = corrupt_annotations(
+            self._gold_annotations,
+            self._task_config["difficulty"],
+            sample_seed,
+        )
+        self._initial_annotations = copy.deepcopy(corrupted)
+        self._current_annotations = copy.deepcopy(corrupted)
+        self._corrections_made = 0
+        self._done = False
+        # Track next annotation ID
+        self._next_ann_id = max((a["id"] for a in self._current_annotations), default=-1) + 1
+        # Compute initial quality
+        initial_quality = compute_annotation_quality(
+            self._initial_annotations, self._gold_annotations
+        )
+        self._state = AnnotationQAState(
+            episode_id=episode_id or str(uuid4()),
+            step_count=0,
+            task_id=task_id,
+            sample_id=scene.get("scene_id", "unknown"),
+            initial_quality=round(initial_quality, 4),
+            current_quality=round(initial_quality, 4),
+            corrections_made=0,
+        )
+        return self._build_observation(
+            reward=None,
+            message=(
+                f"Review the annotations for this COCO image. "
+                f"There are {len(self._current_annotations)} annotations. "
+                f"Some may have incorrect bounding boxes, wrong class labels, "
+                f"or be entirely spurious. Some objects may be missing annotations. "
+                f"You have {self._task_config['max_steps']} steps to fix them."
+            ),
+        )
+    def step(
+        self,
+        action: AnnotationQAAction,
+        timeout_s: Optional[float] = None,
+        **kwargs: Any,
+    ) -> AnnotationQAObservation:
+        """Execute a correction action and return updated observation with reward."""
+        if self._done:
+            return self._build_observation(
+                reward=0.0,
+                message="Episode is already done. Call reset() to start a new episode.",
+            )
+        self._state.step_count += 1
+        error_msg = None
+        # Save pre-action state for reward computation
+        old_annotations = copy.deepcopy(self._current_annotations)
+        # Process action
+        try:
+            if action.action_type == "adjust_bbox":
+                error_msg = self._handle_adjust_bbox(action)
+            elif action.action_type == "change_class":
+                error_msg = self._handle_change_class(action)
+            elif action.action_type == "add_annotation":
+                error_msg = self._handle_add_annotation(action)
+            elif action.action_type == "remove_annotation":
+                error_msg = self._handle_remove_annotation(action)
+            elif action.action_type == "submit":
+                return self._handle_submit()
+            elif action.action_type == "flag_safety":
+                error_msg = self._handle_flag_safety(action)
+            elif action.action_type == "change_attribute":
+                error_msg = self._handle_change_attribute(action)
+            elif action.action_type == "flag_missing":
+                error_msg = self._handle_flag_missing(action)
+            else:
+                error_msg = f"Unknown action_type: {action.action_type}"
+        except Exception as e:
+            error_msg = f"Error processing action: {str(e)}"
+        if error_msg is None:
+            self._corrections_made += 1
+            self._state.corrections_made = self._corrections_made
+        # Compute reward
+        if action.action_type == "flag_safety" and not error_msg:
+            reward = 0.20
+        elif action.action_type == "change_attribute" and not error_msg:
+            reward = 0.15
+        elif action.action_type == "flag_missing" and not error_msg:
+            reward = 0.25
+        else:
+            reward = compute_step_reward(
+                old_annotations,
+                self._current_annotations,
+                self._gold_annotations,
+                action.action_type,
+            )
+        # Update quality tracking
+        current_quality = compute_annotation_quality(
+            self._current_annotations, self._gold_annotations
+        )
+        self._state.current_quality = round(current_quality, 4)
+        # Check if max steps reached
+        if self._state.step_count >= self._task_config["max_steps"]:
+            self._done = True
+            final_score = grade_episode(
+                self._initial_annotations,
+                self._current_annotations,
+                self._gold_annotations,
+            )
+            return self._build_observation(
+                reward=final_score,
+                message=f"Max steps reached. Final score: {final_score:.3f}",
+                error=error_msg,
+            )
+        return self._build_observation(
+            reward=reward,
+            message=(
+                f"{'Error: ' + error_msg if error_msg else 'Correction applied.'} "
+                f"Quality: {current_quality:.3f} "
+                f"(was {self._state.initial_quality:.3f}). "
+                f"Steps remaining: {self._task_config['max_steps'] - self._state.step_count}"
+            ),
+            error=error_msg,
+        )
+    @property
+    def state(self) -> AnnotationQAState:
+        """Get current episode state."""
+        return self._state
+    def close(self) -> None:
+        """Clean up environment resources."""
+        pass
+    async def reset_async(self, **kwargs) -> AnnotationQAObservation:
+        """Async wrapper for reset (required by OpenEnv server interface)."""
+        return self.reset(**kwargs)
+    async def step_async(self, action: AnnotationQAAction, **kwargs) -> AnnotationQAObservation:
+        """Async wrapper for step (required by OpenEnv server interface)."""
+        return self.step(action, **kwargs)
+    # ──────────────────────────────────────────
+    # Action handlers
+    # ──────────────────────────────────────────
+    def _handle_adjust_bbox(self, action: AnnotationQAAction) -> Optional[str]:
+        """Adjust the bounding box of an existing annotation."""
+        if action.annotation_id is None:
+            return "annotation_id is required for adjust_bbox"
+        if action.new_bbox is None:
+            return "new_bbox is required for adjust_bbox"
+        if len(action.new_bbox) != 4:
+            return "new_bbox must have exactly 4 values [x, y, w, h]"
+        ann = self._find_annotation(action.annotation_id)
+        if ann is None:
+            return f"Annotation {action.annotation_id} not found"
+        # Validate bbox values
+        for v in action.new_bbox:
+            if not (0.0 <= v <= 1.0):
+                return "All bbox values must be between 0.0 and 1.0"
+        ann["bbox"] = [round(v, 4) for v in action.new_bbox]
+        return None
+    def _handle_change_class(self, action: AnnotationQAAction) -> Optional[str]:
+        """Change the class label of an existing annotation."""
+        if action.annotation_id is None:
+            return "annotation_id is required for change_class"
+        if action.new_class is None:
+            return "new_class is required for change_class"
+        if action.new_class not in ALL_CLASSES:
+            return f"Invalid class '{action.new_class}'. Valid: {ALL_CLASSES}"
+        ann = self._find_annotation(action.annotation_id)
+        if ann is None:
+            return f"Annotation {action.annotation_id} not found"
+        ann["class_label"] = action.new_class
+        return None
+    def _handle_add_annotation(self, action: AnnotationQAAction) -> Optional[str]:
+        """Add a new annotation."""
+        if action.new_bbox is None:
+            return "new_bbox is required for add_annotation"
+        if action.new_class is None:
+            return "new_class is required for add_annotation"
+        if len(action.new_bbox) != 4:
+            return "new_bbox must have exactly 4 values [x, y, w, h]"
+        if action.new_class not in ALL_CLASSES:
+            return f"Invalid class '{action.new_class}'. Valid: {ALL_CLASSES}"
+        for v in action.new_bbox:
+            if not (0.0 <= v <= 1.0):
+                return "All bbox values must be between 0.0 and 1.0"
+        new_ann = {
+            "id": self._next_ann_id,
+            "bbox": [round(v, 4) for v in action.new_bbox],
+            "class_label": action.new_class,
+        }
+        self._current_annotations.append(new_ann)
+        self._next_ann_id += 1
+        return None
+    def _handle_remove_annotation(self, action: AnnotationQAAction) -> Optional[str]:
+        """Remove an annotation."""
+        if action.annotation_id is None:
+            return "annotation_id is required for remove_annotation"
+        idx = self._find_annotation_index(action.annotation_id)
+        if idx is None:
+            return f"Annotation {action.annotation_id} not found"
+        self._current_annotations.pop(idx)
+        return None
+    def _handle_submit(self) -> AnnotationQAObservation:
+        """Submit corrections and compute final grade."""
+        self._done = True
+        final_score = grade_episode(
+            self._initial_annotations,
+            self._current_annotations,
+            self._gold_annotations,
+        )
+        return self._build_observation(
+            reward=final_score,
+            message=(
+                f"Corrections submitted! "
+                f"Final score: {final_score:.3f}. "
+                f"Quality went from {self._state.initial_quality:.3f} "
+                f"to {self._state.current_quality:.3f} over "
+                f"{self._state.step_count} steps."
+            ),
+        )
+    def _handle_flag_safety(self, action: AnnotationQAAction) -> Optional[str]:
+        if action.annotation_id is None:
+            return "annotation_id is required for flag_safety"
+        ann = self._find_annotation(action.annotation_id)
+        if ann is None: return "Annotation not found"
+        # We don't change state, just append tracking metadata for the grader
+        ann["safety_flagged"] = True
+        return None
+    def _handle_change_attribute(self, action: AnnotationQAAction) -> Optional[str]:
+        if action.annotation_id is None:
+            return "annotation_id is required for change_attribute"
+        if not action.new_attribute:
+            return "new_attribute is required"
+        ann = self._find_annotation(action.annotation_id)
+        if ann is None: return "Annotation not found"
+        ann["class_label"] = action.new_attribute
+        return None
+    def _handle_flag_missing(self, action: AnnotationQAAction) -> Optional[str]:
+        if not action.missing_class:
+            return "missing_class is required for flag_missing"
+        # Flagging missing class adds a placeholder marker
+        self._current_annotations.append({
+            "id": self._next_ann_id,
+            "bbox": [0,0,0,0],
+            "class_label": f"missing_{action.missing_class}"
+        })
+        self._next_ann_id += 1
+        return None
+    # ──────────────────────────────────────────
+    # Helpers
+    # ──────────────────────────────────────────
+    def _find_annotation(self, ann_id: int) -> Optional[Dict]:
+        for ann in self._current_annotations:
+            if ann["id"] == ann_id:
+                return ann
+        return None
+    def _find_annotation_index(self, ann_id: int) -> Optional[int]:
+        for i, ann in enumerate(self._current_annotations):
+            if ann["id"] == ann_id:
+                return i
+        return None
+    def _build_observation(
+        self,
+        reward: Optional[float],
+        message: str,
+        error: Optional[str] = None,
+    ) -> AnnotationQAObservation:
+        """Build an observation from current state."""
+        return AnnotationQAObservation(
+            done=self._done,
+            reward=reward,
+            # Image info from COCO
+            image_url=self._scene_data.get("image_url"),
+            image_width=self._scene_data.get("image_width", 0),
+            image_height=self._scene_data.get("image_height", 0),
+            # Scene info
+            scene_description=self._scene_data.get("scene_description", ""),
+            scene_objects=[
+                {
+                    "id": obj["id"],
+                    "class_label": obj["class_label"],
+                    "position": obj.get("position", ""),
+                    "bbox": obj["bbox"],
+                }
+                for obj in self._scene_data.get("objects", [])
+            ],
+            annotations=[
+                Annotation(
+                    id=ann["id"],
+                    bbox=ann["bbox"],
+                    class_label=ann["class_label"],
+                )
+                for ann in self._current_annotations
+            ],
+            available_classes=ALL_CLASSES,
+            task_id=self._state.task_id,
+            task_description=self._task_config.get("description", ""),
+            corrections_made=self._corrections_made,
+            step_count=self._state.step_count,
+            max_steps=self._task_config.get("max_steps", 20),
+            message=message,
+            last_action_error=error,
+        )

server/grader.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+Grading utilities for the Annotation QA Environment.
+Provides deterministic scoring (0.0-1.0) based on:
+- Precision (penalizes spurious annotations)
+- Class label accuracy
+- Recall of missing-object flags (penalizes missed annotations)
+Predicted and gold annotations are paired by annotation id; no IoU-based or
+Hungarian matching is performed. An IoU helper (compute_iou) is also provided.
+"""
+from typing import Dict, List, Tuple
+def compute_iou(box_a: List[float], box_b: List[float]) -> float:
+    """
+    Compute Intersection over Union between two boxes.
+    Boxes are [x, y, w, h] with values in 0.0–1.0.
+    """
+    ax, ay, aw, ah = box_a
+    bx, by, bw, bh = box_b
+    # Convert to (x1, y1, x2, y2)
+    a_x1, a_y1, a_x2, a_y2 = ax, ay, ax + aw, ay + ah
+    b_x1, b_y1, b_x2, b_y2 = bx, by, bx + bw, by + bh
+    # Intersection
+    inter_x1 = max(a_x1, b_x1)
+    inter_y1 = max(a_y1, b_y1)
+    inter_x2 = min(a_x2, b_x2)
+    inter_y2 = min(a_y2, b_y2)
+    inter_w = max(0, inter_x2 - inter_x1)
+    inter_h = max(0, inter_y2 - inter_y1)
+    inter_area = inter_w * inter_h
+    # Union
+    area_a = aw * ah
+    area_b = bw * bh
+    union_area = area_a + area_b - inter_area
+    if union_area < 1e-8:
+        return 0.0
+    return inter_area / union_area
+def compute_annotation_quality(
+    annotations: List[Dict],
+    gold_annotations: List[Dict],
+) -> float:
+    """
+    Compute specific Semantic VLM visual QA testing metrics (0.0-1.0).
+    Graded on:
+    - Spurious Precision (35%): Did you remove fake boxes without destroying real ones?
+    - Class Match Accuracy (35%): For existing valid boxes, did you change to the correct Gold label?
+    - Missing Flag Recall (30%): Did you successfully use FLAG_MISSING for objects removed from the image?
+    """
+    from collections import Counter
+    if not gold_annotations:
+        return 1.0 if not annotations else 0.5
+    # 1. Spurious Precision
+    gold_map = {a["id"]: a for a in gold_annotations}
+    predictions_valid = [a for a in annotations if not a.get("class_label", "").startswith("missing_")]
+    if not predictions_valid:
+        precision = 0.0
+    else:
+        precision = sum(1 for a in predictions_valid if a["id"] in gold_map) / len(predictions_valid)
+    # 2. Class Match Accuracy for valid boxes
+    matched = [a for a in predictions_valid if a["id"] in gold_map]
+    if not matched:
+        class_acc = 0.0
+    else:
+        class_acc = sum(1 for a in matched if a.get("class_label", "") == gold_map[a["id"]].get("class_label", "")) / len(matched)
+    # 3. Missing Object Flag Recall
+    expected_classes = [g.get("class_label", "") for g in gold_annotations]
+    present_classes = [a.get("class_label", "") for a in annotations if a["id"] in gold_map and not a.get("class_label", "").startswith("missing_")]
+    # Calculate exact missing instances mathematically
+    exp_counts = Counter(expected_classes)
+    pres_counts = Counter(present_classes)
+    actual_missing_classes = []
+    for cls, count in exp_counts.items():
+        if count > pres_counts.get(cls, 0):
+            for _ in range(count - pres_counts.get(cls, 0)):
+                actual_missing_classes.append(cls)
+    if not actual_missing_classes:
+        missing_acc = 1.0
+    else:
+        flagged_classes = [a.get("class_label", "").replace("missing_", "", 1) for a in annotations if a.get("class_label", "").startswith("missing_")]
+        flagged_counts = Counter(flagged_classes)
+        caught = 0
+        for cls in actual_missing_classes:
+            if flagged_counts.get(cls, 0) > 0:
+                caught += 1
+                flagged_counts[cls] -= 1
+        missing_acc = caught / len(actual_missing_classes)
+    quality = 0.35 * class_acc + 0.35 * precision + 0.30 * missing_acc
+    return max(0.0, min(1.0, quality))
+def grade_episode(
+    initial_annotations: List[Dict],
+    final_annotations: List[Dict],
+    gold_annotations: List[Dict],
+) -> float:
+    """
+    Compute the episode grade (0.0–1.0).
+    """
+    initial_quality = compute_annotation_quality(initial_annotations, gold_annotations)
+    final_quality = compute_annotation_quality(final_annotations, gold_annotations)
+    max_improvement = 1.0 - initial_quality
+    if max_improvement < 0.01:
+        return 1.0 if final_quality >= initial_quality - 0.01 else 0.5
+    improvement = final_quality - initial_quality
+    score = improvement / max_improvement
+    return max(0.0, min(1.0, score))
+def compute_step_reward(
+    old_annotations: List[Dict],
+    new_annotations: List[Dict],
+    gold_annotations: List[Dict],
+    action_type: str,
+) -> float:
+    """
+    Compute dense per-step reward based on quality delta.
+    """
+    old_quality = compute_annotation_quality(old_annotations, gold_annotations)
+    new_quality = compute_annotation_quality(new_annotations, gold_annotations)
+    delta = new_quality - old_quality
+    reward = delta * 2.0  # quality improvement → reward
+    reward -= 0.01  # step penalty
+    if action_type == "submit":
+        reward += 0.05
+    return round(reward, 4)