#!/usr/bin/env python3
"""
Evaluate system performance metrics.

Calculates detection rates, coverage, accuracy, and overall effectiveness
based on tactic occurrence counts. Generates separate reports for each model.

Usage:
    python evaluate_metrics.py [--input INPUT_PATH] [--output OUTPUT_PATH]
"""

import argparse
import json
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime
import statistics
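
# Illustrative shape of the input file, assuming the schema implied by the keys
# this script reads ("results", "model", "tactic", "tactic_detected",
# "total_abnormal_events_detected"); the concrete values below are hypothetical:
#
# {
#   "results": [
#     {
#       "model": "example-model",
#       "tactic": "persistence",
#       "tactic_detected": 1,
#       "total_abnormal_events_detected": 4
#     }
#   ]
# }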


class SystemEvaluator:
    """Evaluates multi-agent system performance"""

    def __init__(self, tactic_counts_file: Path):
        self.tactic_counts_file = tactic_counts_file
        self.tactic_data = []
        self.load_tactic_counts()

    def load_tactic_counts(self):
        """Load tactic counts summary data"""
        if not self.tactic_counts_file.exists():
            raise FileNotFoundError(
                f"Tactic counts file not found: {self.tactic_counts_file}"
            )
        data = json.loads(self.tactic_counts_file.read_text(encoding="utf-8"))
        self.tactic_data = data.get("results", [])
        print(f"[INFO] Loaded {len(self.tactic_data)} tactic analysis results")

    def group_by_model(self) -> Dict[str, List[Dict]]:
        """Group tactic data by model"""
        models = {}
        for item in self.tactic_data:
            model = item["model"]
            if model not in models:
                models[model] = []
            models[model].append(item)
        return models

    def calculate_detection_rate(self, model_data: List[Dict] = None) -> Dict[str, Any]:
        """Calculate detection rate: % of files where tactic was correctly detected"""
        data_to_use = model_data if model_data is not None else self.tactic_data

        # Aggregate by tactic
        tactic_aggregates = {}
        for item in data_to_use:
            tactic = item["tactic"]
            if tactic not in tactic_aggregates:
                tactic_aggregates[tactic] = {
                    "total_files": 0,
                    "files_detected": 0,
                    "total_events": 0,
                }
            tactic_aggregates[tactic]["total_files"] += 1
            tactic_aggregates[tactic]["files_detected"] += item["tactic_detected"]
            tactic_aggregates[tactic]["total_events"] += item[
                "total_abnormal_events_detected"
            ]

        total_files = sum(agg["total_files"] for agg in tactic_aggregates.values())
        total_detected = sum(
            agg["files_detected"] for agg in tactic_aggregates.values()
        )
        total_events = sum(agg["total_events"] for agg in tactic_aggregates.values())

        per_tactic_detection = []
        for tactic, agg in sorted(tactic_aggregates.items()):
            files = agg["total_files"]
            detected = agg["files_detected"]
            events = agg["total_events"]
            detection_rate = (detected / files * 100) if files > 0 else 0.0
            per_tactic_detection.append(
                {
                    "tactic": tactic,
                    "total_files": files,
                    "files_detected": detected,
                    "files_missed": files - detected,
                    "total_abnormal_events_detected": events,
                    "detection_rate_percent": detection_rate,
                    "status": (
                        "GOOD"
                        if detection_rate >= 50
                        else ("POOR" if detection_rate > 0 else "NONE")
                    ),
                }
            )

        overall_detection_rate = (
            (total_detected / total_files * 100) if total_files > 0 else 0.0
        )

        return {
            "overall_detection_rate_percent": overall_detection_rate,
            "total_files": total_files,
            "total_files_detected": total_detected,
            "total_files_missed": total_files - total_detected,
            "total_abnormal_events_detected": total_events,
            "total_tactics": len(tactic_aggregates),
            "per_tactic_detection": per_tactic_detection,
        }

    def calculate_coverage(self, model_data: List[Dict] = None) -> Dict[str, Any]:
        """Calculate coverage: how many tactics have at least one successful detection"""
        data_to_use = model_data if model_data is not None else self.tactic_data

        # Aggregate by tactic
        tactic_aggregates = {}
        for item in data_to_use:
            tactic = item["tactic"]
            if tactic not in tactic_aggregates:
                tactic_aggregates[tactic] = 0
            tactic_aggregates[tactic] += item["tactic_detected"]

        total_tactics = len(tactic_aggregates)
        tactics_with_detection = sum(
            1 for count in tactic_aggregates.values() if count > 0
        )
        tactics_with_zero_detection = total_tactics - tactics_with_detection
        coverage_percent = (
            (tactics_with_detection / total_tactics * 100) if total_tactics > 0 else 0.0
        )
        detected_tactics = sorted(
            [tactic for tactic, count in tactic_aggregates.items() if count > 0]
        )
        missed_tactics = sorted(
            [tactic for tactic, count in tactic_aggregates.items() if count == 0]
        )

        return {
            "coverage_percent": coverage_percent,
            "total_tactics_tested": total_tactics,
            "tactics_with_detection": tactics_with_detection,
            "tactics_with_zero_detection": tactics_with_zero_detection,
            "detected_tactics": detected_tactics,
            "missed_tactics": missed_tactics,
        }

    def calculate_accuracy_proxy(self, model_data: List[Dict] = None) -> Dict[str, Any]:
        """Calculate accuracy proxy: detection success rate per tactic"""
        data_to_use = model_data if model_data is not None else self.tactic_data

        # Aggregate by tactic
        tactic_aggregates = {}
        for item in data_to_use:
            tactic = item["tactic"]
            if tactic not in tactic_aggregates:
                tactic_aggregates[tactic] = {"total_files": 0, "files_detected": 0}
            tactic_aggregates[tactic]["total_files"] += 1
            tactic_aggregates[tactic]["files_detected"] += item["tactic_detected"]

        accuracy_scores = []
        for tactic, agg in sorted(tactic_aggregates.items()):
            if agg["total_files"] > 0:
                accuracy = agg["files_detected"] / agg["total_files"]
                accuracy_scores.append(
                    {
                        "tactic": tactic,
                        "accuracy_score": accuracy,
                        "interpretation": (
                            "Perfect"
                            if accuracy == 1.0
                            else ("Partial" if accuracy > 0 else "Failed")
                        ),
                    }
                )

        avg_accuracy = (
            statistics.mean([s["accuracy_score"] for s in accuracy_scores])
            if accuracy_scores
            else 0.0
        )

        return {
            "average_accuracy_score": avg_accuracy,
            "per_tactic_accuracy": accuracy_scores,
            "perfect_matches": sum(
                1 for s in accuracy_scores if s["accuracy_score"] == 1.0
            ),
            "partial_matches": sum(
                1 for s in accuracy_scores if 0 < s["accuracy_score"] < 1.0
            ),
            "failed_matches": sum(
                1 for s in accuracy_scores if s["accuracy_score"] == 0.0
            ),
        }

    def calculate_effectiveness(self, model_data: List[Dict] = None) -> Dict[str, Any]:
        """Calculate overall system effectiveness score (0-100)"""
        detection = self.calculate_detection_rate(model_data)
        coverage = self.calculate_coverage(model_data)
        accuracy = self.calculate_accuracy_proxy(model_data)

        # Weighted effectiveness score:
        # 40% detection rate, 30% coverage, 30% accuracy
        effectiveness_score = (
            detection["overall_detection_rate_percent"] * 0.4
            + coverage["coverage_percent"] * 0.3
            + accuracy["average_accuracy_score"] * 100 * 0.3
        )

        # Grade the system
        if effectiveness_score >= 80:
            grade = "EXCELLENT"
        elif effectiveness_score >= 60:
            grade = "GOOD"
        elif effectiveness_score >= 40:
            grade = "FAIR"
        elif effectiveness_score >= 20:
            grade = "POOR"
        else:
            grade = "CRITICAL"

        return {
            "effectiveness_score": effectiveness_score,
            "grade": grade,
            "component_scores": {
                "detection_rate": detection["overall_detection_rate_percent"],
                "coverage_rate": coverage["coverage_percent"],
                "accuracy_score": accuracy["average_accuracy_score"] * 100,
            },
        }
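
    # Worked example of the weighting above (hypothetical numbers, not from the
    # source data): a model with a 50.0% detection rate, 50.0% coverage, and an
    # average accuracy score of 0.5 scores 0.4 * 50.0 + 0.3 * 50.0 + 0.3 * (0.5 * 100)
    # = 50.0, which grades as FAIR.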

    def identify_issues(self, model_data: List[Dict] = None) -> List[str]:
        """Identify specific issues and gaps"""
        issues = []
        detection = self.calculate_detection_rate(model_data)
        coverage = self.calculate_coverage(model_data)

        # Check overall detection
        if detection["overall_detection_rate_percent"] < 20:
            issues.append(
                f"CRITICAL: Overall detection rate is only {detection['overall_detection_rate_percent']:.1f}%. "
                f"System is failing to detect most attacks ({detection['total_files_missed']}/{detection['total_files']} files missed)."
            )
        elif detection["overall_detection_rate_percent"] < 50:
            issues.append(
                f"WARNING: Detection rate is {detection['overall_detection_rate_percent']:.1f}%, "
                f"below acceptable threshold of 50% ({detection['total_files_missed']}/{detection['total_files']} files missed)."
            )

        # Check coverage
        if coverage["tactics_with_zero_detection"] > 0:
            missed = ", ".join(coverage["missed_tactics"])
            issues.append(
                f"COVERAGE GAP: {coverage['tactics_with_zero_detection']} tactics have zero detection: {missed}"
            )

        # Check for specific problematic tactics
        for item in detection["per_tactic_detection"]:
            if item["total_files"] > 0 and item["detection_rate_percent"] == 0:
                issues.append(
                    f"TACTIC FAILURE: '{item['tactic']}' - "
                    f"{item['total_files']} files analyzed, 0 detected"
                )

        # Check for data quality issues
        data_to_use = model_data if model_data is not None else self.tactic_data
        zero_event_tactics = [
            item["tactic"]
            for item in data_to_use
            if item["total_abnormal_events_detected"] == 0
        ]
        if zero_event_tactics:
            unique_zero = list(set(zero_event_tactics))
            issues.append(
                f"DATA ISSUE: No events to analyze for tactics: {', '.join(unique_zero)}"
            )

        if not issues:
            issues.append(
                "No critical issues detected. System is performing within acceptable parameters."
            )
        return issues

    def run_evaluation_for_model(
        self, model_name: str, model_data: List[Dict]
    ) -> Dict[str, Any]:
        """Run full evaluation for a specific model"""
        print(f"\nEvaluating model: {model_name} ({len(model_data)} files)")

        detection = self.calculate_detection_rate(model_data)
        coverage = self.calculate_coverage(model_data)
        accuracy = self.calculate_accuracy_proxy(model_data)
        effectiveness = self.calculate_effectiveness(model_data)
        issues = self.identify_issues(model_data)

        report = {
            "timestamp": datetime.now().isoformat(),
            "model_name": model_name,
            "evaluation_metrics": {
                "detection_rate": detection,
                "coverage": coverage,
                "accuracy_proxy": accuracy,
                "effectiveness": effectiveness,
            },
            "issues_identified": issues,
        }
        return report

    def run_evaluation(self) -> Dict[str, Any]:
        """Run full evaluation and compile report for all models"""
        print("\n" + "=" * 80)
        print("RUNNING SYSTEM EVALUATION")
        print("=" * 80 + "\n")

        # Group data by model
        models_data = self.group_by_model()
        if not models_data:
            print("[WARNING] No model data found")
            return {"error": "No model data found"}
        print(f"Found {len(models_data)} models: {', '.join(models_data.keys())}")

        # Generate reports for each model
        model_reports = {}
        for model_name, model_data in models_data.items():
            print(f"\nProcessing model: {model_name}")
            model_reports[model_name] = self.run_evaluation_for_model(
                model_name, model_data
            )

        # Create summary report
        summary_report = {
            "timestamp": datetime.now().isoformat(),
            "total_models_evaluated": len(model_reports),
            "models": list(model_reports.keys()),
            "model_reports": model_reports,
        }
        return summary_report


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate multi-agent system performance"
    )
    parser.add_argument(
        "--input",
        default="evaluation/full_pipeline/results/tactic_counts_summary.json",
        help="Path to tactic_counts_summary.json",
    )
    parser.add_argument(
        "--output",
        default="evaluation/full_pipeline/results/evaluation_report.json",
        help="Output file for evaluation report",
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)
    if not input_path.exists():
        print(f"[ERROR] Input file not found: {input_path}")
        print("Run count_tactics.py first to generate tactic counts")
        return 1

    # Run evaluation
    evaluator = SystemEvaluator(input_path)
    report = evaluator.run_evaluation()
    if "error" in report:
        print(f"[ERROR] {report['error']}")
        return 1

    # Save main report
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

    # Save individual model reports
    for model_name, model_report in report["model_reports"].items():
        model_output_path = (
            output_path.parent
            / f"evaluation_report_{model_name.replace(':', '_').replace('/', '_')}.json"
        )
        model_output_path.write_text(
            json.dumps(model_report, indent=2), encoding="utf-8"
        )
        print(f"Model report saved: {model_output_path}")

    # Display summary
    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    print(f"Models evaluated: {report['total_models_evaluated']}")
    print(f"Models: {', '.join(report['models'])}")

    # Show summary for each model
    for model_name, model_report in report["model_reports"].items():
        effectiveness = model_report["evaluation_metrics"]["effectiveness"]
        print(f"\n{model_name}:")
        print(f"  Effectiveness Score: {effectiveness['effectiveness_score']:.1f}/100")
        print(f"  Grade: {effectiveness['grade']}")
        print(
            f"  Detection Rate: {effectiveness['component_scores']['detection_rate']:.1f}%"
        )
        print(f"  Coverage: {effectiveness['component_scores']['coverage_rate']:.1f}%")
        print(f"  Accuracy: {effectiveness['component_scores']['accuracy_score']:.1f}%")

    print(f"\nMain report saved to: {output_path}")
    print("=" * 80 + "\n")
    return 0


if __name__ == "__main__":
    exit(main())
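
# Example invocation, using the script's own default paths shown above
# (illustrative; adjust the paths to your layout):
#
#   python evaluate_metrics.py \
#       --input evaluation/full_pipeline/results/tactic_counts_summary.json \
#       --output evaluation/full_pipeline/results/evaluation_report.json
#
# Besides the main report, one evaluation_report_<model>.json file is written
# per model, with ':' and '/' in the model name replaced by '_'.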