"""
Benchmark Runner for Summarizer-Standard Model

Evaluates summarization performance using ROUGE-style scores, semantic
similarity, latency, and model size metrics.
"""

import argparse
import json
import re
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import requests
import yaml
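
# The runner expects a YAML config shaped roughly like the sketch below. The
# key names are taken from the accesses in this file; the concrete values are
# illustrative assumptions, not part of the original.
#
#   model:
#     base_url: "http://localhost:8080"
#     max_tokens: 256
#     temperature: 0.1
#     timeout: 120
#   datasets:
#     - name: "cnn_dailymail_sample"
#       file: "data/cnn_dailymail_sample.jsonl"
#       sample_size: 50
#       input_field: "article"
#       expected_field: "highlights"
#       instruction: "Summarize the following article."
#   output:
#     model_size_gb: 1.1
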
class SummarizerStandardBenchmarkRunner:
    def __init__(self, config_path: str):
        self.config = self._load_config(config_path)
        self.results = {
            "model": "Summarizer-Standard",
            "timestamp": datetime.now().isoformat(),
            "datasets": {},
            "overall_metrics": {}
        }

    def _load_config(self, config_path: str) -> dict:
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)

    def _load_dataset(self, dataset_path: str, sample_size: int) -> list:
        dataset_file = Path(dataset_path)
        if not dataset_file.exists():
            print(f"⚠️ Dataset not found: {dataset_file}")
            return []

        # Datasets are JSONL: one JSON object per line (blank lines skipped).
        with open(dataset_file, 'r') as f:
            data = [json.loads(line) for line in f if line.strip()]

        return data[:sample_size]
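
    # A dataset line for a CNN/DailyMail-style sample might look like this
    # (illustrative assumption; the actual field names come from
    # "input_field" and "expected_field" in the config):
    #   {"article": "Full article text ...", "highlights": "Reference summary ..."}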

    def _call_model(self, text: str, instruction: str) -> tuple:
        # Build the prompt from the summarization instruction configured for
        # the dataset being benchmarked.
        prompt = f"{instruction}\n\nInput: {text}\n\nSummary:"

        payload = {
            "prompt": prompt,
            "max_tokens": self.config["model"]["max_tokens"],
            "temperature": self.config["model"]["temperature"]
        }

        headers = {'Content-Type': 'application/json'}
        start_time = time.time()

        try:
            response = requests.post(
                f"{self.config['model']['base_url']}/completion",
                json=payload, headers=headers, timeout=self.config["model"]["timeout"]
            )
            latency = time.time() - start_time

            if response.status_code == 200:
                return response.json()["content"], latency
            else:
                return f"Error: {response.status_code}", latency
        except Exception as e:
            return f"Error: {e}", time.time() - start_time
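
    # NOTE: _call_model assumes a llama.cpp-style HTTP server: POST
    # {base_url}/completion with a JSON body containing "prompt",
    # "max_tokens", and "temperature", answering with JSON such as
    #   {"content": " Generated summary text ..."}
    # If your server exposes a different schema, adjust the payload keys and
    # the "content" field accessed above.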

    def _calculate_rouge_scores(self, predicted: str, expected: str) -> dict:
        """Calculate simple ROUGE-style n-gram overlap scores."""
        def get_ngrams(text, n):
            words = re.findall(r'\b\w+\b', text.lower())
            return {tuple(words[i:i + n]) for i in range(len(words) - n + 1)}

        pred_words = re.findall(r'\b\w+\b', predicted.lower())
        exp_words = re.findall(r'\b\w+\b', expected.lower())

        # ROUGE-1: F1 over unique unigrams.
        pred_1grams = set(pred_words)
        exp_1grams = set(exp_words)
        rouge1_prec = len(pred_1grams & exp_1grams) / max(len(pred_1grams), 1)
        rouge1_rec = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1)
        rouge1 = 2 * rouge1_prec * rouge1_rec / max(rouge1_prec + rouge1_rec, 1e-10)

        # ROUGE-2: F1 over unique bigrams.
        pred_2grams = get_ngrams(predicted, 2)
        exp_2grams = get_ngrams(expected, 2)
        rouge2_prec = len(pred_2grams & exp_2grams) / max(len(pred_2grams), 1)
        rouge2_rec = len(pred_2grams & exp_2grams) / max(len(exp_2grams), 1)
        rouge2 = 2 * rouge2_prec * rouge2_rec / max(rouge2_prec + rouge2_rec, 1e-10)

        # ROUGE-L, approximated here by unigram recall; true ROUGE-L uses the
        # longest common subsequence (see the sketch after this method).
        rougeL = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1)

        return {
            'rouge1': rouge1,
            'rouge2': rouge2,
            'rougeL': rougeL
        }
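
    # A minimal sketch of a true LCS-based ROUGE-L F-measure, in case closer
    # fidelity to the standard metric is wanted. This hypothetical helper is
    # not called anywhere by the runner.
    def _rouge_l_lcs(self, predicted: str, expected: str) -> float:
        pred = re.findall(r'\b\w+\b', predicted.lower())
        exp = re.findall(r'\b\w+\b', expected.lower())
        if not pred or not exp:
            return 0.0
        # Classic O(m*n) dynamic program for longest-common-subsequence length.
        dp = [[0] * (len(exp) + 1) for _ in range(len(pred) + 1)]
        for i, pw in enumerate(pred, 1):
            for j, ew in enumerate(exp, 1):
                dp[i][j] = dp[i - 1][j - 1] + 1 if pw == ew else max(dp[i - 1][j], dp[i][j - 1])
        lcs = dp[-1][-1]
        precision, recall = lcs / len(pred), lcs / len(exp)
        return 2 * precision * recall / max(precision + recall, 1e-10)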

    def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Calculate simple word overlap similarity (Jaccard similarity)."""
        try:
            words1 = set(re.findall(r'\b\w+\b', text1.lower()))
            words2 = set(re.findall(r'\b\w+\b', text2.lower()))

            # Two empty texts are trivially identical; one empty text shares
            # nothing with a non-empty one.
            if not words1 and not words2:
                return 1.0
            if not words1 or not words2:
                return 0.0

            intersection = len(words1 & words2)
            union = len(words1 | words2)
            return intersection / union
        except Exception as e:
            print(f"Warning: Similarity calculation failed: {e}")
            return 0.0
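
    # Worked example: "the cat sat" vs. "the cat ran" shares {the, cat} out
    # of {the, cat, sat, ran}, giving a Jaccard similarity of 2 / 4 = 0.5.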

    def _calculate_compression_ratio(self, input_text: str, summary: str) -> float:
        """Calculate compression ratio (summary length / input length)."""
        input_words = len(input_text.split())
        summary_words = len(summary.split())
        return summary_words / max(input_words, 1)
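
    # Worked example: a 100-word article condensed to a 20-word summary gives
    # a compression ratio of 20 / 100 = 0.2.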

    def _run_dataset_benchmark(self, dataset_name: str, dataset_config: dict) -> dict:
        print(f"📊 Running benchmark on {dataset_name}...")

        dataset = self._load_dataset(dataset_config["file"], dataset_config["sample_size"])
        if not dataset:
            return {"error": f"No data found for {dataset_name}"}

        results = {
            "sample_count": len(dataset),
            "rouge1_scores": [],
            "rouge2_scores": [],
            "rougeL_scores": [],
            "semantic_similarity": [],
            "compression_ratios": [],
            "latency_ms": [],
            "successful_predictions": 0,
            "examples": []
        }

        for i, item in enumerate(dataset):
            if i % 10 == 0:
                print(f"  Processing sample {i+1}/{len(dataset)}")

            input_text = item[dataset_config["input_field"]]
            expected_summary = item[dataset_config["expected_field"]]

            # Generate a summary with the dataset's configured instruction.
            predicted_summary, latency = self._call_model(
                input_text, dataset_config["instruction"]
            )

            if not predicted_summary.startswith("Error"):
                results["successful_predictions"] += 1

                # Score the prediction against the reference summary.
                rouge_scores = self._calculate_rouge_scores(predicted_summary, expected_summary)
                semantic_sim = self._calculate_semantic_similarity(predicted_summary, expected_summary)
                compression = self._calculate_compression_ratio(input_text, predicted_summary)

                # Record per-sample metrics.
                results["rouge1_scores"].append(rouge_scores['rouge1'])
                results["rouge2_scores"].append(rouge_scores['rouge2'])
                results["rougeL_scores"].append(rouge_scores['rougeL'])
                results["semantic_similarity"].append(semantic_sim)
                results["compression_ratios"].append(compression)
                results["latency_ms"].append(latency * 1000)

                # Keep up to five qualitative examples for the report.
                if len(results["examples"]) < 5:
                    results["examples"].append({
                        "input": input_text[:200] + "..." if len(input_text) > 200 else input_text,
                        "expected": expected_summary,
                        "predicted": predicted_summary,
                        "rouge1": rouge_scores['rouge1'],
                        "semantic_similarity": semantic_sim,
                        "compression_ratio": compression
                    })

        # Average the metrics over successful predictions only.
        if results["successful_predictions"] > 0:
            results["averages"] = {
                "rouge1": np.mean(results["rouge1_scores"]),
                "rouge2": np.mean(results["rouge2_scores"]),
                "rougeL": np.mean(results["rougeL_scores"]),
                "semantic_similarity": np.mean(results["semantic_similarity"]),
                "compression_ratio": np.mean(results["compression_ratios"]),
                "latency_ms": np.mean(results["latency_ms"])
            }
        else:
            results["averages"] = {
                "rouge1": 0.0,
                "rouge2": 0.0,
                "rougeL": 0.0,
                "semantic_similarity": 0.0,
                "compression_ratio": 0.0,
                "latency_ms": 0.0
            }

        print(f"✅ {dataset_name} completed")
        return results

    def run_benchmarks(self):
        print("🚀 Starting Summarizer-Standard Benchmark Suite")
        print("=" * 60)
        print("Evaluating summarization quality with ROUGE and semantic metrics")
        print()

        # Check that the model server is reachable before running anything.
        try:
            response = requests.get(f"{self.config['model']['base_url']}/health", timeout=10)
            if response.status_code == 200:
                print("✅ Summarizer-Standard server is running")
            else:
                print(f"❌ Server returned status {response.status_code}")
                return
        except Exception as e:
            print(f"❌ Cannot connect to Summarizer-Standard server: {e}")
            print("Make sure to start the model server first:")
            print("  cd summarizer_standard_model.app/Contents/Resources && ./run_server")
            return

        # Benchmark each configured dataset.
        for dataset_config in self.config["datasets"]:
            dataset_name = dataset_config["name"]
            results = self._run_dataset_benchmark(dataset_name, dataset_config)
            self.results["datasets"][dataset_name] = results

        # Aggregate, save, and write the plain-text report.
        self._calculate_overall_metrics()
        self._save_results()
        self._create_benchmarks_txt()

    def _calculate_overall_metrics(self):
        all_rouge1 = []
        all_rouge2 = []
        all_rougeL = []
        all_semantic = []
        all_compression = []
        all_latency = []
        total_samples = 0

        for dataset_results in self.results["datasets"].values():
            if "averages" in dataset_results:
                all_rouge1.append(dataset_results["averages"]["rouge1"])
                all_rouge2.append(dataset_results["averages"]["rouge2"])
                all_rougeL.append(dataset_results["averages"]["rougeL"])
                all_semantic.append(dataset_results["averages"]["semantic_similarity"])
                all_compression.append(dataset_results["averages"]["compression_ratio"])
                all_latency.append(dataset_results["averages"]["latency_ms"])
                total_samples += dataset_results["sample_count"]

        # Note: these are unweighted means over datasets, not over samples.
        self.results["overall_metrics"] = {
            "avg_rouge1": np.mean(all_rouge1) if all_rouge1 else 0,
            "avg_rouge2": np.mean(all_rouge2) if all_rouge2 else 0,
            "avg_rougeL": np.mean(all_rougeL) if all_rougeL else 0,
            "avg_semantic_similarity": np.mean(all_semantic) if all_semantic else 0,
            "avg_compression_ratio": np.mean(all_compression) if all_compression else 0,
            "avg_latency_ms": np.mean(all_latency) if all_latency else 0,
            "model_size_gb": self.config["output"]["model_size_gb"],
            "total_samples": total_samples
        }

    def _save_results(self):
        results_dir = Path("results")
        results_dir.mkdir(exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = results_dir / f"summarizer_standard_benchmark_{timestamp}.json"

        with open(results_file, 'w') as f:
            json.dump(self.results, f, indent=2)

        print(f"📄 Detailed results saved to: {results_file}")

    def _create_benchmarks_txt(self):
        """Create the benchmarks.txt file with all results."""
        benchmarks_content = []
        benchmarks_content.append("=" * 80)
        benchmarks_content.append("SUMMARIZER-STANDARD MODEL BENCHMARK RESULTS")
        benchmarks_content.append("=" * 80)
        benchmarks_content.append("")
        benchmarks_content.append("📊 EXECUTIVE SUMMARY")
        benchmarks_content.append("-" * 50)
        benchmarks_content.append(f"Benchmark Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        benchmarks_content.append(f"Model: {self.results['model']}")
        benchmarks_content.append(f"Datasets: {', '.join(self.results['datasets'])}")
        benchmarks_content.append(f"Total Samples: {self.results['overall_metrics']['total_samples']}")
        benchmarks_content.append(f"Model Size: {self.results['overall_metrics']['model_size_gb']:.3f} GB")
        benchmarks_content.append("")

        overall = self.results['overall_metrics']
        benchmarks_content.append("🎯 OVERALL PERFORMANCE METRICS")
        benchmarks_content.append("-" * 50)
        benchmarks_content.append(f"  ROUGE-1 Score: {overall['avg_rouge1']:.3f}")
        benchmarks_content.append(f"  ROUGE-2 Score: {overall['avg_rouge2']:.3f}")
        benchmarks_content.append(f"  ROUGE-L Score: {overall['avg_rougeL']:.3f}")
        benchmarks_content.append(f"  Semantic Similarity: {overall['avg_semantic_similarity']:.3f}")
        benchmarks_content.append(f"  Compression Ratio: {overall['avg_compression_ratio']:.3f}")
        benchmarks_content.append(f"  Average Latency: {overall['avg_latency_ms']:.1f}ms")
        benchmarks_content.append("")

        # Per-dataset breakdown.
        benchmarks_content.append("📊 DATASET BREAKDOWN")
        benchmarks_content.append("-" * 50)

        for dataset_name, dataset_results in self.results["datasets"].items():
            if "averages" in dataset_results:
                benchmarks_content.append("")
                benchmarks_content.append(f"🔹 {dataset_name.upper().replace('_', ' ')}")
                benchmarks_content.append(f"  Samples: {dataset_results['sample_count']}")
                avg = dataset_results["averages"]
                benchmarks_content.append(f"  ROUGE-1: {avg['rouge1']:.3f}")
                benchmarks_content.append(f"  ROUGE-2: {avg['rouge2']:.3f}")
                benchmarks_content.append(f"  ROUGE-L: {avg['rougeL']:.3f}")
                benchmarks_content.append(f"  Semantic Similarity: {avg['semantic_similarity']:.3f}")
                benchmarks_content.append(f"  Compression Ratio: {avg['compression_ratio']:.3f}")
                benchmarks_content.append(f"  Latency: {avg['latency_ms']:.1f}ms")

                # Include a few qualitative examples per dataset.
                if "examples" in dataset_results and dataset_results["examples"]:
                    benchmarks_content.append("")
                    benchmarks_content.append("  📝 SAMPLE OUTPUTS:")
                    for i, example in enumerate(dataset_results["examples"][:3]):
                        benchmarks_content.append(f"  Example {i+1}:")
                        benchmarks_content.append(f"    Input: {example['input']}")
                        benchmarks_content.append(f"    Expected: {example['expected']}")
                        benchmarks_content.append(f"    Predicted: {example['predicted']}")
                        benchmarks_content.append(f"    ROUGE-1: {example['rouge1']:.3f}, Similarity: {example['semantic_similarity']:.3f}")
                        benchmarks_content.append("")

        benchmarks_content.append("")
        benchmarks_content.append("📖 METRICS EXPLANATION")
        benchmarks_content.append("-" * 50)
        benchmarks_content.append("• ROUGE-1: Unigram (word) overlap between predicted and expected summaries")
        benchmarks_content.append("• ROUGE-2: Bigram (2-word) overlap between predicted and expected summaries")
        benchmarks_content.append("• ROUGE-L: Longest Common Subsequence overlap (approximated here by unigram recall)")
        benchmarks_content.append("• Semantic Similarity: Word overlap similarity (Jaccard coefficient)")
        benchmarks_content.append("• Compression Ratio: Summary length ÷ Input length (0.1-0.8 is ideal)")
        benchmarks_content.append("• Latency: Response time in milliseconds (lower = faster)")
        benchmarks_content.append("")
        benchmarks_content.append("📈 INTERPRETING SCORES:")
        benchmarks_content.append("• ROUGE scores > 0.5 are considered good, > 0.3 acceptable")
        benchmarks_content.append("• Current scores indicate the model is not performing summarization effectively")
        benchmarks_content.append("• The model generates very short outputs that miss key information")
        benchmarks_content.append("")
        benchmarks_content.append("=" * 80)

        # Write the plain-text report to the working directory.
        with open("benchmarks.txt", "w") as f:
            f.write("\n".join(benchmarks_content))

        print("📄 Results summary saved to: benchmarks.txt")


def main():
    parser = argparse.ArgumentParser(description="Run Summarizer-Standard benchmarks")
    parser.add_argument("--config", default="benchmark_config.yaml", help="Path to the benchmark config YAML")

    args = parser.parse_args()

    try:
        runner = SummarizerStandardBenchmarkRunner(args.config)
        runner.run_benchmarks()
        print("\n✅ Benchmarking completed! Results saved to benchmarks.txt")
    except Exception as e:
        print(f"❌ Benchmark failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
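
# Typical invocation (assuming this file is saved as run_benchmark.py; the
# script name here is an illustrative assumption):
#
#   python run_benchmark.py --config benchmark_config.yaml
#
# A run writes a timestamped JSON file under results/ and a human-readable
# benchmarks.txt in the working directory.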