"""
CompI Phase 1.D: Command-Line Quality Evaluation Tool

Command-line interface for batch evaluation and analysis of generated images.

Usage:
    python src/generators/compi_phase1d_cli_evaluation.py --help
    python src/generators/compi_phase1d_cli_evaluation.py --analyze
    python src/generators/compi_phase1d_cli_evaluation.py --batch-score 4 3 4 4 3
"""
|
|
import os
import argparse
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List
|
|
import pandas as pd
from PIL import Image
|
|
from compi_phase1d_evaluate_quality import (
    parse_filename, get_image_metrics, load_existing_evaluations,
    save_evaluation, EVALUATION_CRITERIA, OUTPUT_DIR, EVAL_CSV
)
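
# NOTE: Assumed interface of the helpers imported above, inferred from how they
# are used in this module (not independently verified here):
#   - parse_filename(fname) returns a metadata dict (with at least 'filename',
#     'prompt', 'style', and 'mood' keys) or a falsy value for unrecognized names.
#   - load_existing_evaluations() returns a dict keyed by image filename.
#   - EVALUATION_CRITERIA maps criterion keys (e.g. 'prompt_match',
#     'creative_appeal') to info dicts that include a human-readable 'name'.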
|
|
def setup_args():
    """Set up command-line arguments."""
    parser = argparse.ArgumentParser(
        description="CompI Phase 1.D: Command-Line Quality Evaluation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze existing evaluations
  python %(prog)s --analyze

  # Batch score all unevaluated images (prompt_match, style, mood, quality, appeal)
  python %(prog)s --batch-score 4 3 4 4 3 --notes "Batch evaluation - good quality"

  # Generate detailed report
  python %(prog)s --report --output evaluation_report.txt

  # List unevaluated images
  python %(prog)s --list-unevaluated
        """
    )

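    # Input directory options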
    parser.add_argument("--output-dir", default=OUTPUT_DIR,
                        help="Directory containing generated images")

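    # Analysis and reporting commands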
    parser.add_argument("--analyze", action="store_true",
                        help="Display evaluation summary and statistics")

    parser.add_argument("--report", action="store_true",
                        help="Generate detailed evaluation report")

    parser.add_argument("--output", "-o",
                        help="Output file for report (default: stdout)")

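    # Batch scoring of unevaluated images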
    parser.add_argument("--batch-score", nargs=5, type=int,
                        metavar=("PROMPT", "STYLE", "MOOD", "QUALITY", "APPEAL"),
                        help="Batch score all unevaluated images (1-5 for each criterion)")

    parser.add_argument("--notes", default="CLI batch evaluation",
                        help="Notes for batch evaluation")

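    # Listing commands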
    parser.add_argument("--list-all", action="store_true",
                        help="List all images with evaluation status")

    parser.add_argument("--list-evaluated", action="store_true",
                        help="List only evaluated images")

    parser.add_argument("--list-unevaluated", action="store_true",
                        help="List only unevaluated images")

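    # Optional filters (applied to listing and batch scoring)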
    parser.add_argument("--style", help="Filter by style")
    parser.add_argument("--mood", help="Filter by mood")

    return parser.parse_args()
|
|
def load_images(output_dir: str) -> List[Dict]:
    """Load and parse all images from output directory."""
    if not os.path.exists(output_dir):
        print(f"❌ Output directory '{output_dir}' not found!")
        return []

    image_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.png')]
    parsed_images = []

    for fname in image_files:
        metadata = parse_filename(fname)
        if metadata:
            parsed_images.append(metadata)

    return parsed_images
|
|
def filter_images(images: List[Dict], style: str = None, mood: str = None) -> List[Dict]:
    """Filter images by style and/or mood."""
    filtered = images

    if style:
        filtered = [img for img in filtered if img.get('style', '').lower() == style.lower()]

    if mood:
        filtered = [img for img in filtered if img.get('mood', '').lower() == mood.lower()]

    return filtered
|
|
def analyze_evaluations(existing_evals: Dict):
    """Display evaluation analysis."""
    if not existing_evals:
        print("❌ No evaluations found.")
        return

    df = pd.DataFrame.from_dict(existing_evals, orient='index')

    print("📊 CompI Phase 1.D - Evaluation Analysis")
    print("=" * 50)
    print(f"Total Evaluated Images: {len(df)}")
    print()

    print("📈 Score Statistics:")
    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            mean_score = df[criterion_key].mean()
            std_score = df[criterion_key].std()
            min_score = df[criterion_key].min()
            max_score = df[criterion_key].max()

            print(f"  {criterion_info['name']:20}: {mean_score:.2f} ± {std_score:.2f} (range: {min_score}-{max_score})")

    print()

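    # Per-style and per-mood breakdowns (shown only when those columns exist in the saved evaluations)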
    if 'style' in df.columns and 'prompt_match' in df.columns:
        print("🎨 Top Performing Styles (by Prompt Match):")
        style_scores = df.groupby('style')['prompt_match'].mean().sort_values(ascending=False)
        for style, score in style_scores.head(5).items():
            print(f"  {style:15}: {score:.2f}")
        print()

    if 'mood' in df.columns and 'creative_appeal' in df.columns:
        print("🌟 Top Performing Moods (by Creative Appeal):")
        mood_scores = df.groupby('mood')['creative_appeal'].mean().sort_values(ascending=False)
        for mood, score in mood_scores.head(5).items():
            print(f"  {mood:15}: {score:.2f}")
        print()
|
|
def generate_detailed_report(existing_evals: Dict) -> str:
    """Generate detailed evaluation report."""
    if not existing_evals:
        return "No evaluations found."

    df = pd.DataFrame.from_dict(existing_evals, orient='index')

    report_lines = [
        "# CompI Phase 1.D - Detailed Evaluation Report",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total Images Evaluated: {len(df)}",
        "",
        "## Overall Performance Summary"
    ]

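    # Overall mean ± standard deviation for each criterion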
    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            mean_score = df[criterion_key].mean()
            std_score = df[criterion_key].std()
            report_lines.append(f"- **{criterion_info['name']}**: {mean_score:.2f} ± {std_score:.2f}")

    report_lines.extend([
        "",
        "## Score Distribution Analysis"
    ])

    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            scores = df[criterion_key]
            report_lines.extend([
                "",
                f"### {criterion_info['name']}",
                f"- Mean: {scores.mean():.2f}",
                f"- Median: {scores.median():.2f}",
                f"- Mode: {scores.mode().iloc[0] if not scores.mode().empty else 'N/A'}",
                f"- Range: {scores.min()}-{scores.max()}",
                "- Distribution: " + " | ".join([f"{i}★: {(scores == i).sum()}" for i in range(1, 6)])
            ])

    if 'style' in df.columns:
        report_lines.extend([
            "",
            "## Style Performance Analysis"
        ])

        for criterion_key in EVALUATION_CRITERIA.keys():
            if criterion_key in df.columns:
                style_performance = df.groupby('style')[criterion_key].agg(['mean', 'count']).sort_values('mean', ascending=False)
                report_lines.extend([
                    "",
                    f"### {EVALUATION_CRITERIA[criterion_key]['name']} by Style",
                ])

                for style, row in style_performance.iterrows():
                    report_lines.append(f"- {style}: {row['mean']:.2f} (n={int(row['count'])})")

    report_lines.extend([
        "",
        "## Recommendations",
        "",
        "### Areas for Improvement"
    ])

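    # Flag the two lowest-scoring criteria as the main improvement targets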
    criterion_means = {}
    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            criterion_means[criterion_info['name']] = df[criterion_key].mean()

    if criterion_means:
        lowest_criteria = sorted(criterion_means.items(), key=lambda x: x[1])[:2]
        for criterion_name, score in lowest_criteria:
            report_lines.append(f"- Focus on improving **{criterion_name}** (current: {score:.2f}/5)")

    report_lines.extend([
        "",
        "### Best Practices",
        "- Continue systematic evaluation for trend analysis",
        "- Experiment with parameter adjustments for low-scoring areas",
        "- Consider A/B testing different generation approaches",
        "- Document successful style/mood combinations for reuse"
    ])

    return "\n".join(report_lines)
|
|
def batch_evaluate_images(images: List[Dict], scores: List[int], notes: str, output_dir: str):
    """Batch evaluate unevaluated images."""
    existing_evals = load_existing_evaluations()
    unevaluated = [img for img in images if img['filename'] not in existing_evals]

    if not unevaluated:
        print("✅ All images are already evaluated!")
        return

    print(f"📦 Batch evaluating {len(unevaluated)} images...")

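    # The five CLI scores arrive in the documented order (prompt_match, style, mood,
    # quality, appeal); mapping them positionally assumes EVALUATION_CRITERIA defines
    # its keys in that same order (dict insertion order is preserved on Python 3.7+).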
    criteria_keys = list(EVALUATION_CRITERIA.keys())
    score_dict = dict(zip(criteria_keys, scores))

    for img_data in unevaluated:
        fname = img_data["filename"]
        img_path = os.path.join(output_dir, fname)

        try:
            metrics = get_image_metrics(img_path)
            save_evaluation(fname, img_data, score_dict, notes, metrics)
            print(f"  ✅ Evaluated: {fname}")
        except Exception as e:
            print(f"  ❌ Error evaluating {fname}: {e}")

    print("🎉 Batch evaluation completed!")
|
|
def list_images(images: List[Dict], existing_evals: Dict, show_evaluated: bool = True, show_unevaluated: bool = True):
    """List images with evaluation status."""
    print(f"📋 Image List ({len(images)} total)")
    print("-" * 80)

    for img_data in images:
        fname = img_data["filename"]
        is_evaluated = fname in existing_evals

        if (show_evaluated and is_evaluated) or (show_unevaluated and not is_evaluated):
            status = "✅" if is_evaluated else "❌"
            prompt = img_data.get('prompt', 'unknown')[:30]
            style = img_data.get('style', 'unknown')[:15]
            mood = img_data.get('mood', 'unknown')[:15]

            print(f"{status} {fname}")
            print(f"   Prompt: {prompt}... | Style: {style} | Mood: {mood}")

            if is_evaluated:
                eval_data = existing_evals[fname]
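                # Only the first three criterion scores are shown to keep each entry compact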
                scores = [f"{k}:{eval_data.get(k, 'N/A')}" for k in EVALUATION_CRITERIA.keys() if k in eval_data]
                print(f"   Scores: {' | '.join(scores[:3])}...")
            print()
|
|
def main():
    """Main CLI function."""
    args = setup_args()

    images = load_images(args.output_dir)
    if not images:
        return

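    # Style/mood filters apply to the image list used for listing and batch scoring;
    # --analyze and --report always read the full set of saved evaluations.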
    images = filter_images(images, args.style, args.mood)

    existing_evals = load_existing_evaluations()

    if args.analyze:
        analyze_evaluations(existing_evals)

    elif args.report:
        report = generate_detailed_report(existing_evals)
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"📄 Report saved to: {args.output}")
        else:
            print(report)

    elif args.batch_score:
        batch_evaluate_images(images, args.batch_score, args.notes, args.output_dir)

    elif args.list_all:
        list_images(images, existing_evals, True, True)

    elif args.list_evaluated:
        list_images(images, existing_evals, True, False)

    elif args.list_unevaluated:
        list_images(images, existing_evals, False, True)

    else:
        print("❓ No command specified. Use --help for usage information.")
|
|
if __name__ == "__main__":
    main()
|
|