| |
| |
| |
| |
| |
| |
| |
| |
"""
PitVQA Multi-Agent Orchestration System

Specialized agents for methodologically rigorous VLM pipeline management:
1. JobMonitorAgent - Track HuggingFace Jobs status
2. CurationAgent - Quality-filter showcase examples
3. DatasetValidatorAgent - Validate image-embedded dataset
4. ModelVerifierAgent - Test merged model outputs
5. TrainingSpecialistAgent - Validate training/LoRA configuration
6. EvaluationSpecialistAgent - Compute metrics against quality thresholds
7. DemoSyncAgent - Update Gradio Space with results

Run with: python pitvqa_agent_orchestrator.py
"""
|
|
| import os |
| import json |
| import time |
| from dataclasses import dataclass |
| from typing import Dict, List, Optional, Any |
| from datetime import datetime |
| from enum import Enum |
|
|
| |
| |
| |
|
|
class AgentStatus(Enum):
    """Lifecycle states reported by each pipeline agent."""

    IDLE = "idle"          # agent has not run yet
    RUNNING = "running"    # agent is currently executing
    SUCCESS = "success"    # last run completed and passed its checks
    FAILED = "failed"      # last run found a hard failure
    WAITING = "waiting"    # prerequisites not ready yet; retry later
|
|
@dataclass
class AgentResult:
    """Outcome of a single agent run, with an auto-filled timestamp."""

    agent_name: str
    status: AgentStatus
    message: str
    data: Optional[Dict] = None
    timestamp: str = ""

    def __post_init__(self):
        # Stamp creation time when the caller left the timestamp falsy.
        self.timestamp = self.timestamp or datetime.now().isoformat()
|
|
| |
| |
| |
|
|
class BaseAgent:
    """Base class for all PitVQA agents.

    Subclasses override ``run`` and report via ``AgentResult``; shared
    logging and status reporting live here.
    """

    def __init__(self, name: str):
        self.name = name
        self.status = AgentStatus.IDLE
        # History of results this agent has produced (appended by subclasses).
        self.results: List[AgentResult] = []

    def log(self, message: str, level: str = "INFO"):
        """Print a namespaced, icon-prefixed status line.

        NOTE(review): the icon glyphs were mojibake in the source dump;
        restored to the intended emoji — confirm against the original file.
        """
        icon = {"INFO": "ℹ️", "SUCCESS": "✅", "ERROR": "❌", "WARN": "⚠️"}.get(level, "📝")
        print(f"[{self.name}] {icon} {message}")

    def run(self) -> AgentResult:
        """Execute the agent once; must be implemented by subclasses."""
        raise NotImplementedError

    def report(self) -> Dict:
        """Summarize this agent's status and accumulated results."""
        return {
            "agent": self.name,
            "status": self.status.value,
            "results": [r.__dict__ for r in self.results]
        }
|
|
| |
| |
| |
|
|
class JobMonitorAgent(BaseAgent):
    """Monitors HuggingFace Jobs and reports status."""

    def __init__(self, job_ids: List[str]):
        super().__init__("JobMonitor")
        self.job_ids = job_ids
        # job_id -> latest status dict produced by check_job()
        self.job_status = {}

    def check_job(self, job_id: str) -> Dict:
        """Check single job status using HF API.

        Returns a dict with ``id`` and ``status`` keys; on any failure
        ``status`` is ``"UNKNOWN"`` and the exception text is in ``error``.
        """
        try:
            from huggingface_hub import HfApi
            api = HfApi()

            # NOTE(review): assumes HfApi exposes get_job(); recent
            # huggingface_hub versions call this inspect_job() — confirm.
            job = api.get_job(job_id)
            return {
                "id": job_id,
                # Jobs API may expose status as an object with a .stage, or
                # as a plain value — handle both shapes defensively.
                "status": job.status.stage if hasattr(job.status, 'stage') else str(job.status),
                "message": job.status.message if hasattr(job.status, 'message') else None
            }
        except Exception as e:
            return {"id": job_id, "status": "UNKNOWN", "error": str(e)}

    def run(self) -> AgentResult:
        """Poll every tracked job and roll statuses up into one result."""
        self.status = AgentStatus.RUNNING
        self.log(f"Checking {len(self.job_ids)} jobs...")

        all_complete = True
        any_failed = False

        for job_id in self.job_ids:
            status = self.check_job(job_id)
            self.job_status[job_id] = status

            stage = status.get("status", "UNKNOWN")
            self.log(f"Job {job_id[:8]}: {stage}")

            # Anything not terminal-successful keeps the pipeline waiting;
            # explicit failures flip the whole run to FAILED.
            if stage not in ["COMPLETED", "SUCCESS"]:
                all_complete = False
            if stage in ["FAILED", "ERROR"]:
                any_failed = True

        if any_failed:
            self.status = AgentStatus.FAILED
            return AgentResult(self.name, AgentStatus.FAILED, "Some jobs failed", self.job_status)
        elif all_complete:
            self.status = AgentStatus.SUCCESS
            return AgentResult(self.name, AgentStatus.SUCCESS, "All jobs complete", self.job_status)
        else:
            self.status = AgentStatus.WAITING
            return AgentResult(self.name, AgentStatus.WAITING, "Jobs still running", self.job_status)
|
|
| |
| |
| |
|
|
class CurationAgent(BaseAgent):
    """Curates showcase examples based on quality criteria."""

    # Declarative quality checks kept for reference/external use; the
    # scoring below implements its own heuristics.
    QUALITY_CRITERIA = {
        "coordinate_validity": lambda x, y: 0 <= x <= 100 and 0 <= y <= 100,
        "coordinate_diversity": lambda coords: len(set(coords)) > len(coords) * 0.5,
        "video_diversity": lambda vids: len(set(vids)) >= min(5, len(vids)),
        "frame_diversity": lambda frames: len(set(frames)) >= min(8, len(frames)),
    }

    def __init__(self, results_path: str = "./curation_review/all_results.json"):
        super().__init__("Curation")
        self.results_path = results_path
        self.curated_examples = []

    def load_results(self) -> List[Dict]:
        """Load raw curation results; return [] if the file is absent."""
        try:
            with open(self.results_path) as f:
                return json.load(f)
        except FileNotFoundError:
            self.log("Results file not found - job may still be running", "WARN")
            return []

    def score_example(self, example: Dict) -> float:
        """Score a single example (0-1) from success, geometry, and text."""
        score = 0.0

        # Base credit for a successful generation.
        if example.get("success"):
            score += 0.3

        if example.get("task") == "point":
            x, y = example.get("x"), example.get("y")
            # BUGFIX: compare against None so a legitimate coordinate of 0
            # is not treated as missing (the original used truthiness).
            if x is not None and y is not None:
                # Prefer points away from the image border.
                if 10 < x < 90 and 10 < y < 90:
                    score += 0.3
                else:
                    score += 0.1
        elif example.get("task") == "bbox":
            bbox = example.get("bbox")
            if bbox and len(bbox) == 4:
                # Prefer mid-sized boxes: neither degenerate nor full-frame.
                area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                if 100 < area < 5000:
                    score += 0.3
                else:
                    score += 0.1

        # Structured-output credit: model emitted a point/box tag.
        response = example.get("response", "")
        if "<point" in response or "<box" in response:
            score += 0.2

        # Target-mention credit.
        # BUGFIX: guard against empty target ("" is a substring of every
        # string) and lower-case BOTH sides for a fair comparison.
        target = example.get("target", "")
        if target and target.lower() in response.lower():
            score += 0.2

        return min(score, 1.0)

    def curate(self, results: List[Dict], top_k: int = 12) -> List[Dict]:
        """Select best diverse examples (<=2 per video, unique frames,
        no task taking more than half the slots)."""
        if not results:
            return []

        # Score successful examples and rank best-first.
        scored = [(self.score_example(ex), ex) for ex in results if ex.get("success")]
        scored.sort(key=lambda x: x[0], reverse=True)

        curated = []
        # BUGFIX: the original used a set and called .count() on it, which
        # raises AttributeError — track per-video tallies in a dict instead.
        video_counts = {}
        used_frames = set()
        used_tasks = {"point": 0, "bbox": 0}

        for score, ex in scored:
            if len(curated) >= top_k:
                break

            video = ex.get("video_id")
            frame = ex.get("frame_idx")
            task = ex.get("task")

            # Diversity constraints.
            if video_counts.get(video, 0) >= 2:
                continue
            if (video, frame) in used_frames:
                continue
            if used_tasks.get(task, 0) >= top_k // 2:
                continue

            curated.append({**ex, "quality_score": score})
            video_counts[video] = video_counts.get(video, 0) + 1
            used_frames.add((video, frame))
            used_tasks[task] = used_tasks.get(task, 0) + 1

        return curated

    def run(self) -> AgentResult:
        """Load, score, and curate examples; succeed with >= 8 keepers."""
        self.status = AgentStatus.RUNNING
        self.log("Loading curation results...")

        results = self.load_results()
        if not results:
            self.status = AgentStatus.WAITING
            return AgentResult(self.name, AgentStatus.WAITING, "No results available yet")

        self.log(f"Scoring {len(results)} examples...")
        self.curated_examples = self.curate(results)

        if len(self.curated_examples) >= 8:
            self.status = AgentStatus.SUCCESS

            # Diversity stats for the log only.
            videos = set(ex["video_id"] for ex in self.curated_examples)
            frames = set(ex["frame_idx"] for ex in self.curated_examples)

            self.log(f"Curated {len(self.curated_examples)} examples", "SUCCESS")
            self.log(f"  Videos: {len(videos)} unique")
            self.log(f"  Frames: {len(frames)} unique")

            return AgentResult(
                self.name,
                AgentStatus.SUCCESS,
                f"Curated {len(self.curated_examples)} high-quality diverse examples",
                {"examples": self.curated_examples}
            )
        else:
            self.status = AgentStatus.FAILED
            return AgentResult(
                self.name,
                AgentStatus.FAILED,
                f"Only {len(self.curated_examples)} examples passed quality checks"
            )
|
|
| |
| |
| |
|
|
class DatasetValidatorAgent(BaseAgent):
    """Validates image-embedded dataset quality."""

    def __init__(self, dataset_id: str = "mmrech/pitvqa-spatial-with-images"):
        super().__init__("DatasetValidator")
        self.dataset_id = dataset_id

    def run(self) -> AgentResult:
        """Spot-check the first ten rows for required fields and images."""
        self.status = AgentStatus.RUNNING
        self.log(f"Validating dataset: {self.dataset_id}")

        try:
            from datasets import load_dataset

            # Only a small prefix is needed for the sanity check.
            ds = load_dataset(self.dataset_id, split="train[:10]")

            # Schema check: both columns must exist.
            missing = [f for f in ["image", "messages"] if f not in ds.features]
            if missing:
                self.status = AgentStatus.FAILED
                return AgentResult(
                    self.name,
                    AgentStatus.FAILED,
                    f"Missing fields: {missing}"
                )

            # A row is valid when its image is present and non-empty.
            valid_images = sum(
                1
                for img in (row.get("image") for row in ds)
                if img and hasattr(img, "size") and img.size[0] > 0
            )

            if valid_images != len(ds):
                self.status = AgentStatus.FAILED
                return AgentResult(
                    self.name,
                    AgentStatus.FAILED,
                    f"Invalid images: {len(ds) - valid_images}/{len(ds)}"
                )

            self.status = AgentStatus.SUCCESS
            return AgentResult(
                self.name,
                AgentStatus.SUCCESS,
                f"Dataset valid: {valid_images}/{len(ds)} images OK",
                {"sample_count": len(ds), "valid_images": valid_images}
            )

        except Exception as e:
            # Any load failure is treated as "not published yet".
            self.status = AgentStatus.WAITING
            return AgentResult(
                self.name,
                AgentStatus.WAITING,
                f"Dataset not yet available: {e}"
            )
|
|
| |
| |
| |
|
|
class ModelVerifierAgent(BaseAgent):
    """Verifies merged model outputs are correct."""

    # Prompts for a future end-to-end smoke test (not exercised yet; the
    # current run() only checks repo contents).
    TEST_PROMPTS = [
        ("Point to the suction device", "point"),
        ("Draw a bounding box around the surgical instrument", "bbox"),
        ("What surgical phase is this?", "classification"),
    ]

    def __init__(self, model_id: str = "mmrech/pitvqa-qwen2vl-merged"):
        super().__init__("ModelVerifier")
        self.model_id = model_id

    def run(self) -> AgentResult:
        """Check the model repo exists and contains weights plus config."""
        self.status = AgentStatus.RUNNING
        self.log(f"Verifying model: {self.model_id}")

        try:
            from huggingface_hub import HfApi
            api = HfApi()

            try:
                info = api.model_info(self.model_id)
                self.log(f"Model found: {info.modelId}")

                files = [f.rfilename for f in info.siblings]
                # (removed an unused `required` list — presence of weights
                # and config is checked by the two flags below)

                # Weights may be sharded or renamed, so substring-match.
                has_model = any("safetensors" in f or "pytorch" in f for f in files)
                has_config = "config.json" in files

                if has_model and has_config:
                    self.status = AgentStatus.SUCCESS
                    return AgentResult(
                        self.name,
                        AgentStatus.SUCCESS,
                        f"Model verified: {len(files)} files present",
                        {"files": files[:10]}
                    )
                else:
                    self.status = AgentStatus.FAILED
                    return AgentResult(
                        self.name,
                        AgentStatus.FAILED,
                        f"Missing model files (has_model={has_model}, has_config={has_config})"
                    )

            except Exception as e:
                # Repo lookup failures mean the merge job has not pushed yet.
                self.status = AgentStatus.WAITING
                return AgentResult(
                    self.name,
                    AgentStatus.WAITING,
                    f"Model not yet available: {e}"
                )

        except Exception as e:
            self.status = AgentStatus.FAILED
            return AgentResult(self.name, AgentStatus.FAILED, f"Error: {e}")
|
|
| |
| |
| |
|
|
class TrainingSpecialistAgent(BaseAgent):
    """
    Specialist in HuggingFace LLM Training (TRL/SFT/LoRA/DPO).

    Responsibilities:
    - Validate training configurations
    - Check adapter quality
    - Recommend training improvements
    - Verify LoRA/PEFT setup
    """

    # Reference glossary of supported training methods.
    TRAINING_METHODS = {
        "SFT": "Supervised Fine-Tuning - learning from (input, output) pairs",
        "LoRA": "Low-Rank Adaptation - parameter-efficient adapters",
        "DPO": "Direct Preference Optimization - learning from preferences",
        "RLHF": "Reinforcement Learning from Human Feedback",
    }

    # Recommended baseline hyperparameters for this pipeline.
    OPTIMAL_CONFIG = {
        "lora_r": 16,
        "lora_alpha": 32,
        "learning_rate": 1e-4,
        "batch_size": 1,
        "gradient_accumulation_steps": 16,
        "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
    }

    def __init__(self, adapter_repo: str = "mmrech/pitvqa-qwen2vl-unified-v2"):
        super().__init__("TrainingSpecialist")
        self.adapter_repo = adapter_repo

    def validate_adapter_config(self) -> Dict:
        """Validate adapter configuration.

        Returns ``{"config", "issues", "recommendations", "valid"}`` on
        success, or ``{"error": ..., "valid": False}`` when the config
        cannot be downloaded or parsed.
        """
        try:
            from huggingface_hub import hf_hub_download
            # (removed a redundant local `import json`; json is imported
            # at module level)

            config_path = hf_hub_download(
                repo_id=self.adapter_repo,
                filename="stage4/adapter_config.json"
            )

            with open(config_path) as f:
                config = json.load(f)

            issues = []
            recommendations = []

            # LoRA rank sanity: too low likely underfits; very high wastes
            # memory without much gain.
            if config.get("r", 0) < 8:
                issues.append("LoRA rank too low (r < 8)")
            elif config.get("r", 0) > 64:
                recommendations.append("Consider reducing LoRA rank for efficiency")

            # At least one attention projection layer should be adapted.
            target_modules = config.get("target_modules", [])
            if not any("proj" in m for m in target_modules):
                issues.append("No projection layers targeted")

            return {
                "config": config,
                "issues": issues,
                "recommendations": recommendations,
                "valid": len(issues) == 0
            }

        except Exception as e:
            return {"error": str(e), "valid": False}

    def recommend_next_training(self, current_metrics: Optional[Dict] = None) -> Dict:
        """Recommend next training steps based on current metrics."""
        recommendations = []

        if not current_metrics:
            recommendations.append({
                "priority": "HIGH",
                "action": "Run evaluation to get baseline metrics",
                "method": "scripts/evaluate_unified_vlm.py"
            })
        else:
            accuracy = current_metrics.get("accuracy", 0)

            if accuracy < 0.7:
                recommendations.append({
                    "priority": "HIGH",
                    "action": "Increase training epochs or data",
                    "method": "SFT with more epochs"
                })

            if 0.7 <= accuracy < 0.85:
                recommendations.append({
                    "priority": "MEDIUM",
                    "action": "Consider DPO for preference learning",
                    "method": "Create chosen/rejected pairs from predictions"
                })

            if accuracy >= 0.85:
                recommendations.append({
                    "priority": "LOW",
                    "action": "Model performing well - focus on inference optimization",
                    "method": "Merge adapters, quantize for deployment"
                })

        return {"recommendations": recommendations}

    def run(self) -> AgentResult:
        """Validate the adapter config and attach training recommendations."""
        self.status = AgentStatus.RUNNING
        self.log(f"Validating training setup: {self.adapter_repo}")

        validation = self.validate_adapter_config()

        if validation.get("valid"):
            self.status = AgentStatus.SUCCESS
            recommendations = self.recommend_next_training()

            return AgentResult(
                self.name,
                AgentStatus.SUCCESS,
                f"Training config valid. LoRA r={validation['config'].get('r')}",
                {
                    "config": validation["config"],
                    "recommendations": recommendations["recommendations"]
                }
            )
        elif validation.get("error"):
            # Download/parse failure: adapter likely not pushed yet.
            self.status = AgentStatus.WAITING
            return AgentResult(
                self.name,
                AgentStatus.WAITING,
                f"Could not load adapter: {validation['error']}"
            )
        else:
            self.status = AgentStatus.FAILED
            return AgentResult(
                self.name,
                AgentStatus.FAILED,
                f"Issues found: {validation['issues']}",
                validation
            )
|
|
| |
| |
| |
|
|
class EvaluationSpecialistAgent(BaseAgent):
    """
    Specialist in Model Evaluation (metrics, benchmarks, validation).

    Responsibilities:
    - Compute accuracy, F1, precision, recall
    - Validate coordinate predictions (MAE, quadrant accuracy)
    - Compare against baselines
    - Generate evaluation reports
    """

    # Metric families by task type (reference only).
    METRICS = {
        "classification": ["accuracy", "f1", "precision", "recall"],
        "localization": ["mae", "quadrant_accuracy", "distance_error"],
        "detection": ["iou", "ap", "ar"],
    }

    # Pass/fail gates; "mae" is lower-is-better, the rest higher-is-better.
    THRESHOLDS = {
        "quadrant_accuracy": 0.75,
        "mae": 15.0,
        "classification_accuracy": 0.80,
    }

    def __init__(self, model_repo: str = "mmrech/pitvqa-qwen2vl-unified-v2"):
        super().__init__("EvaluationSpecialist")
        self.model_repo = model_repo
        self.metrics = {}

    def load_evaluation_results(self) -> Dict:
        """Load existing evaluation results if available, else {}."""
        try:
            with open("evaluation_results.json") as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def compute_quick_metrics(self, predictions: List[Dict]) -> Dict:
        """Compute quick metrics from predictions."""
        if not predictions:
            return {}

        metrics = {}

        # Coordinate (pointing) predictions.
        coord_preds = [p for p in predictions if p.get("task") in ["point", "pointing"]]
        if coord_preds:
            # BUGFIX: require BOTH coordinates; the original only checked x,
            # so a prediction missing y crashed the distance computation.
            valid = [
                p for p in coord_preds
                if p.get("x") is not None and p.get("y") is not None
            ]
            metrics["valid_rate"] = len(valid) / len(coord_preds)

            errors = []
            for p in valid:
                # BUGFIX: compare against None so a ground-truth coordinate
                # of 0 (a valid edge position) is not silently skipped.
                if p.get("gt_x") is not None and p.get("gt_y") is not None:
                    err = ((p["x"] - p["gt_x"])**2 + (p["y"] - p["gt_y"])**2)**0.5
                    errors.append(err)

            if errors:
                metrics["mae"] = sum(errors) / len(errors)
                # Within 25 units counts as landing in the right quadrant.
                metrics["quadrant_accuracy"] = sum(1 for e in errors if e < 25) / len(errors)

        # Classification predictions.
        class_preds = [p for p in predictions if p.get("task") == "classification"]
        if class_preds:
            correct = sum(1 for p in class_preds if p.get("prediction") == p.get("ground_truth"))
            metrics["classification_accuracy"] = correct / len(class_preds)

        return metrics

    def evaluate_against_thresholds(self, metrics: Dict) -> Dict:
        """Check metrics against quality thresholds."""
        results = {"passed": [], "failed": [], "warnings": []}

        for metric, threshold in self.THRESHOLDS.items():
            if metric in metrics:
                value = metrics[metric]
                # MAE is an error metric: lower is better.
                if metric == "mae":
                    passed = value <= threshold
                else:
                    passed = value >= threshold

                entry = {"metric": metric, "value": value, "threshold": threshold}
                if passed:
                    results["passed"].append(entry)
                else:
                    results["failed"].append(entry)

        return results

    def generate_report(self, metrics: Dict, threshold_results: Dict) -> str:
        """Generate a human-readable evaluation report string.

        NOTE(review): the section glyphs were mojibake in the source dump;
        restored to the intended emoji — confirm against the original file.
        """
        report = []
        report.append("=" * 50)
        report.append("EVALUATION REPORT")
        report.append("=" * 50)

        report.append("\n📊 METRICS:")
        for k, v in metrics.items():
            report.append(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")

        report.append("\n✅ PASSED:")
        for item in threshold_results["passed"]:
            report.append(f"  {item['metric']}: {item['value']:.4f} (threshold: {item['threshold']})")

        if threshold_results["failed"]:
            report.append("\n❌ FAILED:")
            for item in threshold_results["failed"]:
                report.append(f"  {item['metric']}: {item['value']:.4f} (threshold: {item['threshold']})")

        return "\n".join(report)

    def run(self, predictions: Optional[List[Dict]] = None) -> AgentResult:
        """Evaluate stored or supplied predictions against thresholds."""
        self.status = AgentStatus.RUNNING
        self.log("Running evaluation...")

        # Prefer a previously saved evaluation over recomputation.
        existing = self.load_evaluation_results()

        if existing:
            self.log("Found existing evaluation results")
            self.metrics = existing
        elif predictions:
            self.log(f"Computing metrics from {len(predictions)} predictions")
            self.metrics = self.compute_quick_metrics(predictions)
        else:
            self.status = AgentStatus.WAITING
            return AgentResult(
                self.name,
                AgentStatus.WAITING,
                "No predictions available for evaluation"
            )

        threshold_results = self.evaluate_against_thresholds(self.metrics)

        report = self.generate_report(self.metrics, threshold_results)
        self.log(f"\n{report}")

        if threshold_results["failed"]:
            self.status = AgentStatus.FAILED
            return AgentResult(
                self.name,
                AgentStatus.FAILED,
                f"{len(threshold_results['failed'])} metrics below threshold",
                {"metrics": self.metrics, "thresholds": threshold_results}
            )
        else:
            self.status = AgentStatus.SUCCESS
            return AgentResult(
                self.name,
                AgentStatus.SUCCESS,
                f"All {len(threshold_results['passed'])} metrics passed",
                {"metrics": self.metrics, "thresholds": threshold_results}
            )
|
|
| |
| |
| |
|
|
class DemoSyncAgent(BaseAgent):
    """Syncs curated examples to Gradio Space."""

    def __init__(self, space_id: str = "mmrech/pitvqa-surgical-vlm"):
        super().__init__("DemoSync")
        self.space_id = space_id

    def run(self, curated_examples: Optional[List[Dict]] = None) -> AgentResult:
        """Check the Space is live and report readiness of curated examples.

        NOTE: the actual upload is not implemented yet — this only verifies
        the Space runtime state and counts the examples awaiting sync.
        """
        self.status = AgentStatus.RUNNING
        self.log(f"Syncing to Space: {self.space_id}")

        if not curated_examples:
            self.status = AgentStatus.WAITING
            return AgentResult(
                self.name,
                AgentStatus.WAITING,
                "No curated examples to sync"
            )

        try:
            from huggingface_hub import HfApi
            api = HfApi()

            try:
                info = api.space_info(self.space_id)
                runtime = info.runtime

                if runtime and runtime.stage == "RUNNING":
                    # (removed a stray f-prefix on a placeholder-free string,
                    # and an unused json.dumps of the examples — nothing was
                    # ever uploaded with it)
                    self.log("Space is running", "SUCCESS")

                    self.status = AgentStatus.SUCCESS
                    return AgentResult(
                        self.name,
                        AgentStatus.SUCCESS,
                        f"Space running, {len(curated_examples)} examples ready for sync",
                        {"space_status": "RUNNING", "examples_count": len(curated_examples)}
                    )
                else:
                    self.status = AgentStatus.WAITING
                    return AgentResult(
                        self.name,
                        AgentStatus.WAITING,
                        f"Space not running: {runtime.stage if runtime else 'unknown'}"
                    )

            except Exception as e:
                self.status = AgentStatus.FAILED
                return AgentResult(self.name, AgentStatus.FAILED, f"Space error: {e}")

        except Exception as e:
            self.status = AgentStatus.FAILED
            return AgentResult(self.name, AgentStatus.FAILED, f"Error: {e}")
|
|
| |
| |
| |
|
|
class PitVQAOrchestrator:
    """Coordinates all agents for the PitVQA pipeline."""

    def __init__(self, job_ids: List[str]):
        self.agents = {
            "monitor": JobMonitorAgent(job_ids),
            "curation": CurationAgent(),
            "dataset": DatasetValidatorAgent(),
            "model": ModelVerifierAgent(),
            "training": TrainingSpecialistAgent(),
            "evaluation": EvaluationSpecialistAgent(),
            "demo": DemoSyncAgent(),
        }
        # agent key -> latest AgentResult from the most recent cycle
        self.results = {}
        self.run_count = 0

    def run_cycle(self) -> Dict:
        """Run one orchestration cycle and return the status report.

        NOTE(review): the banner emoji were mojibake in the source dump and
        were restored to plausible glyphs — confirm against the original.
        """
        self.run_count += 1
        print(f"\n{'='*60}")
        print(f"🔄 ORCHESTRATION CYCLE {self.run_count}")
        print(f"{'='*60}")

        # Phase 1: always check job status first.
        print("\n🔍 Phase 1: Job Monitoring")
        monitor_result = self.agents["monitor"].run()
        self.results["monitor"] = monitor_result

        # Phase 2: training validation runs regardless of job state.
        print("\n🎓 Phase 2: Training Validation (HF-LLM-Trainer)")
        training_result = self.agents["training"].run()
        self.results["training"] = training_result

        # Downstream phases only run while jobs are healthy (complete or
        # still going). BUGFIX: phases 6-7 consumed curation_result, which
        # was undefined when this guard was not taken — they now live
        # inside it so a failed monitor cannot raise NameError.
        if monitor_result.status in (AgentStatus.SUCCESS, AgentStatus.WAITING):
            print("\n🎨 Phase 3: Curation")
            curation_result = self.agents["curation"].run()
            self.results["curation"] = curation_result

            print("\n📦 Phase 4: Dataset Validation")
            self.results["dataset"] = self.agents["dataset"].run()

            print("\n🤖 Phase 5: Model Verification")
            self.results["model"] = self.agents["model"].run()

            print("\n📊 Phase 6: Evaluation (Metrics & Quality)")
            curated = curation_result.data.get("examples", []) if curation_result.data else []
            self.results["evaluation"] = self.agents["evaluation"].run(predictions=curated)

            print("\n🌐 Phase 7: Demo Sync")
            self.results["demo"] = self.agents["demo"].run(curated)

        return self.generate_report()

    def generate_report(self) -> Dict:
        """Generate comprehensive status report."""
        report = {
            "timestamp": datetime.now().isoformat(),
            "cycle": self.run_count,
            "overall_status": self._compute_overall_status(),
            "agents": {}
        }

        for name, result in self.results.items():
            report["agents"][name] = {
                "status": result.status.value,
                "message": result.message
            }

        return report

    def _compute_overall_status(self) -> str:
        """Compute overall pipeline status from the latest agent results."""
        statuses = [r.status for r in self.results.values()]

        # BUGFIX: guard the empty case — all() over [] is True, which would
        # report COMPLETE before any agent has run.
        if not statuses:
            return "UNKNOWN"
        if all(s == AgentStatus.SUCCESS for s in statuses):
            return "COMPLETE"
        elif any(s == AgentStatus.FAILED for s in statuses):
            return "NEEDS_ATTENTION"
        elif any(s == AgentStatus.WAITING for s in statuses):
            return "IN_PROGRESS"
        else:
            return "UNKNOWN"

    def print_summary(self, report: Dict):
        """Print human-readable summary of the last cycle's report."""
        print(f"\n{'='*60}")
        print("📊 ORCHESTRATION SUMMARY")
        print(f"{'='*60}")
        print(f"Time: {report['timestamp']}")
        print(f"Cycle: {report['cycle']}")
        print(f"Overall: {report['overall_status']}")
        print("\nAgent Status:")
        for name, info in report["agents"].items():
            # NOTE(review): mojibake glyphs restored here as well.
            icon = {"success": "✅", "failed": "❌", "waiting": "⏳", "running": "🔄"}.get(info["status"], "❓")
            print(f"  {icon} {name}: {info['status']} - {info['message'][:50]}")
|
|
| |
| |
| |
|
|
def main():
    """Entry point: run one orchestration cycle and persist the report."""
    print("🚀 PitVQA Multi-Agent Orchestrator Starting...")

    # HuggingFace Job IDs currently being tracked by this pipeline.
    job_ids = [
        "696cfe9946affbb321046bd9",
        "696cfebf57a10a9d296ca042",
    ]

    orchestrator = PitVQAOrchestrator(job_ids)

    # One cycle per invocation; re-run the script to poll again.
    report = orchestrator.run_cycle()
    orchestrator.print_summary(report)

    # Persist the cycle report for later inspection.
    with open("orchestration_report.json", "w") as f:
        json.dump(report, f, indent=2)
    # (removed a stray f-prefix — the string has no placeholders)
    print("\n💾 Report saved to orchestration_report.json")

    return report


if __name__ == "__main__":
    main()
|
|