Spaces:

MCP-1st-Birthday
/

sentinel-finops

Sleeping

App Files Files Community

sentinel-finops / agent.py

akshay4

update agent

25c3a9a verified 5 months ago

raw

history blame contribute delete

39.3 kB

	import os
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	import json
	import pandas as pd
	from datetime import datetime
	from typing import Dict, List, Literal, Optional
	from typing_extensions import TypedDict
	from enum import Enum
	from pydantic import BaseModel, Field
	import re

	# Smolagents imports
	from smolagents import CodeAgent, tool, LiteLLMModel

	# OpenAI for direct API calls
	# OpenAI for direct API calls
	from openai import OpenAI
	import google.generativeai as genai

	# LlamaIndex for RAG
	from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
	from llama_index.llms.openai import OpenAI as LlamaOpenAI
	from llama_index.embeddings.openai import OpenAIEmbedding
	from dotenv import load_dotenv
	load_dotenv()

	# --- Configuration ---
	# Both values can be overridden through the environment. Neither constant is
	# referenced again in the visible part of this file.
	DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "gpt-4o-mini")
	REASONING_EFFORT = os.getenv("REASONING_EFFORT", "low")

	# A missing key only produces a warning here; the clients below are still
	# constructed with whatever os.getenv() returns (possibly None).
	if "OPENAI_API_KEY" not in os.environ:
	    print("WARNING: OPENAI_API_KEY not found in environment. Agent may fail.")

	if "GOOGLE_API_KEY" not in os.environ:
	    print("WARNING: GOOGLE_API_KEY not found in environment. Gemini will fail.")

	# Module-level side effects: create the OpenAI client and configure the
	# google.generativeai SDK once at import time.
	client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
	genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

	# Gemini Model Configuration
	# Model name used by extract_query_intent() and classify_resources().
	GEMINI_MODEL = "gemini-2.5-flash"  # alternatives noted by the author: "gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-2.0-flash-exp"

	# --- 1. ENUMS & PYDANTIC MODELS ---

	class TeamName(str, Enum):
	    # Canonical team labels used for ownership attribution. Because of the
	    # str mixin every member is also a plain string equal to its value
	    # (e.g. TeamName.ML == "ML").
	    FINANCE = "Finance"
	    DATA = "Data"
	    QA = "QA"
	    DEVOPS = "DevOps"
	    SECURITY = "Security"
	    FRONTEND = "Frontend"
	    BACKEND = "Backend"
	    INFRA = "Infrastructure"
	    PLATFORM = "Platform"
	    ML = "ML"
	    # Returned by infer_team_from_tags() when nothing matches.
	    UNKNOWN = "Unknown"

	class QueryIntent(BaseModel):
	    """Translates natural language into strict data filters."""
	    # Built by extract_query_intent() from the model's JSON answer and consumed
	    # by filter_dataframe(). A default-constructed instance applies no filter.
	    target_services: Optional[List[str]] = Field(default_factory=list, description="List of services to filter (e.g. ['RDS', 'EBS', 'EC2'])")
	    target_env: Optional[str] = Field(None, description="Environment to focus on (e.g. 'prod', 'dev')")
	    target_team: Optional[str] = Field(None, description="Team to focus on (e.g. 'DevOps', 'ML', 'Data')")
	    # filter_dataframe() only applies the cost filter when this is > 0.
	    min_cost: Optional[float] = Field(0.0, description="Minimum monthly cost to filter for")
	    only_orphans: bool = Field(False, description="True if user is looking for untagged/unowned resources")
	    # NOTE(review): no visible code branches on focus_area; it is only logged.
	    focus_area: Literal["SAVINGS", "SAFETY", "GENERAL", "TEAM_RANKING"] = Field("GENERAL", description="The user's primary goal")

	# TypedDict for Gemini Schema (Avoids 'default' error)
	# Mirrors the fields of QueryIntent without Field(...) defaults; it is passed
	# as response_schema to Gemini in extract_query_intent(). Per the author's
	# note above, the plain TypedDict form is what avoids the 'default' error.
	class QueryIntentSchema(TypedDict):
	    target_services: List[str]
	    target_env: Optional[str]
	    target_team: Optional[str]
	    min_cost: float
	    only_orphans: bool
	    focus_area: Literal["SAVINGS", "SAFETY", "GENERAL", "TEAM_RANKING"]

	class ResourceClassification(BaseModel):
	    """Classification for a single resource."""
	    # resource_id is the only required field; run_audit() uses it to match the
	    # classification back to the input row's 'ResourceID'.
	    resource_id: str
	    # Defaults lean towards human review when the model omits a field.
	    action: Literal["KEEP", "DELETE", "HIBERNATE", "AUDIT"] = Field("AUDIT", description="Recommended action")
	    reason: str = Field("No reason provided", description="Justification for the action")
	    policy_rule: str = Field("Manual Review", description="Policy rule applied")
	    # May be None when the model returns no team.
	    inferred_team: Optional[TeamName] = Field(None, description="Inferred team ownership based on tags or resource name")

	class ClassificationResponse(BaseModel):
	    """Complete classification response for all resources."""
	    # classify_resources() returns an instance with an empty list when the
	    # Gemini call or the parsing of its answer fails.
	    classifications: list[ResourceClassification]

	# TypedDict for Gemini Schema
	# Default-free mirror of ResourceClassification; nested inside
	# ClassificationResponseSchema, which is what Gemini receives.
	class ResourceClassificationSchema(TypedDict):
	    resource_id: str
	    action: Literal["KEEP", "DELETE", "HIBERNATE", "AUDIT"]
	    reason: str
	    policy_rule: str
	    inferred_team: Optional[TeamName]

	# Top-level response schema passed as response_schema in classify_resources().
	class ClassificationResponseSchema(TypedDict):
	    classifications: List[ResourceClassificationSchema]

	# --- 2. REASONING LOGGER ---
	class ReasoningLogger:
	    """Collects timestamped reasoning lines and optionally streams each one.

	    Every entry is kept in ``self.logs``, echoed to stdout, and forwarded to
	    ``log_callback`` when a truthy callback was supplied.
	    """

	    def __init__(self, log_callback=None):
	        self.log_callback = log_callback
	        self.logs = []

	    def log(self, emoji, message):
	        """Record one ``[HH:MM:SS] <emoji> <message>`` line."""
	        stamp = f"{datetime.now():%H:%M:%S}"
	        line = f"[{stamp}] {emoji} {message}"
	        self.logs.append(line)
	        print(line)
	        callback = self.log_callback
	        if callback:
	            callback(line)

	    def get_logs(self):
	        """Return all recorded lines joined by newlines."""
	        separator = "\n"
	        return separator.join(self.logs)

	# --- 3. RAG SETUP (POLICY ENGINE) ---
	def setup_policy_rag():
	    """Build a LlamaIndex query engine over ``policy.txt``.

	    Returns the query engine, or None when the file does not exist or any
	    step of the setup raises (the error is printed as a warning).
	    """
	    policy_file = "policy.txt"
	    try:
	        # Use lightweight model for RAG
	        Settings.llm = LlamaOpenAI(model="gpt-4o-mini")
	        Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

	        if not os.path.exists(policy_file):
	            return None

	        loaded_docs = SimpleDirectoryReader(input_files=[policy_file]).load_data()
	        policy_index = VectorStoreIndex.from_documents(loaded_docs)
	        return policy_index.as_query_engine(similarity_top_k=3)
	    except Exception as err:
	        print(f"Warning: Policy RAG setup failed: {err}")
	        return None

	# Built once at import time; query_policy() reads this module-level handle.
	policy_engine = setup_policy_rag()

	@tool
	def query_policy(question: str) -> str:
	    """Query enterprise policy using RAG.

	    Args:
	        question: The policy question to ask (e.g., 'What are the deletion rules?')
	    """
	    # The docstring above is kept verbatim: the @tool decorator consumes it.
	    engine = policy_engine
	    if engine:
	        try:
	            answer = engine.query(question)
	            return str(answer)
	        except Exception as err:
	            return f"Policy query error: {err}"
	    return "Policy engine unavailable (policy.txt missing)."

	# --- 4. HELPER FUNCTIONS ---
	def get_tag(tags_dict, keys, default=''):
	    """Return the first tag value whose key matches ``keys`` case-insensitively.

	    Args:
	        tags_dict: Parsed tag mapping. Anything that is not a dict (None, a
	            list from unexpected JSON, NaN, ...) is treated as "no tags".
	        keys: A single key or an iterable of acceptable keys.
	        default: Value returned when no usable tag is found.

	    Returns:
	        The matching tag value, or ``default``. Tags are scanned in the
	        mapping's own order. A matching tag whose value is None (JSON null)
	        is skipped so callers never receive None in place of ``default``.
	    """
	    if not isinstance(tags_dict, dict):
	        return default
	    if isinstance(keys, str):
	        keys = [keys]
	    wanted = {key.lower() for key in keys}
	    for name, value in tags_dict.items():
	        # Non-string keys cannot match a tag name; skipping them avoids
	        # AttributeError on .lower().
	        if isinstance(name, str) and name.lower() in wanted and value is not None:
	            return value
	    return default

	# Ordered (substrings, team keyword) hints applied to a lowercased resource ID.
	_RESOURCE_ID_TEAM_HINTS = (
	    (('ml-', 'training'), 'ml'),
	    (('db-', 'rds'), 'data'),
	    (('web', 'frontend'), 'frontend'),
	    (('api', 'backend'), 'backend'),
	    (('test', 'qa'), 'qa'),
	)

	# Ordered (substrings, canonical team) rules; the first rule that matches wins,
	# so the order below is part of the behavior.
	_TEAM_KEYWORD_RULES = (
	    (('finance', 'fin'), TeamName.FINANCE),
	    (('data', 'science'), TeamName.DATA),
	    (('ml', 'ai'), TeamName.ML),
	    (('qa', 'test', 'quality'), TeamName.QA),
	    (('devops', 'ops', 'sre'), TeamName.DEVOPS),
	    (('platform',), TeamName.PLATFORM),
	    (('security', 'sec', 'infosec'), TeamName.SECURITY),
	    (('frontend', 'web', 'ui'), TeamName.FRONTEND),
	    (('backend', 'api'), TeamName.BACKEND),
	    (('infra',), TeamName.INFRA),
	)


	def infer_team_from_tags(tags_dict, resource_id="") -> TeamName:
	    """Infer the owning team from tags, falling back to the resource ID.

	    Args:
	        tags_dict: Parsed tag mapping for the resource.
	        resource_id: Resource identifier used only when no team tag exists.
	            Matching is case-insensitive; non-string values are ignored.

	    Returns:
	        The first ``TeamName`` whose keywords occur in the team string, or
	        ``TeamName.UNKNOWN`` when nothing matches.
	    """
	    raw_team = get_tag(tags_dict, ['team', 'Team', 'TEAM', 'owner_team'])
	    # Tag values may be non-strings (numbers, null); normalise before matching.
	    # A whitespace-only tag is treated the same as a missing one.
	    team_str = str(raw_team).strip().lower() if raw_team else ''

	    # If no tag, try to infer from ResourceID (Sherlock Mode)
	    if not team_str and isinstance(resource_id, str) and resource_id:
	        # Lowercase the ID so "ML-Training-01" matches like "ml-training-01".
	        rid = resource_id.lower()
	        for needles, keyword in _RESOURCE_ID_TEAM_HINTS:
	            if any(needle in rid for needle in needles):
	                team_str = keyword
	                break

	    if not team_str:
	        return TeamName.UNKNOWN

	    # Map common variations to canonical TeamName
	    for needles, team in _TEAM_KEYWORD_RULES:
	        if any(needle in team_str for needle in needles):
	            return team
	    return TeamName.UNKNOWN

	# --- 5. INTENT EXTRACTION & FILTERING (NEW) ---
	def extract_query_intent(query: str, logger) -> QueryIntent:
	    """Translate a natural-language query into a ``QueryIntent`` via Gemini.

	    Example: 'Find expensive databases' -> target_services=['RDS'], min_cost=100.

	    Args:
	        query: The user's free-text request. Empty input skips the model call.
	        logger: Object exposing ``log(emoji, message)``.

	    Returns:
	        The parsed intent. A default ``QueryIntent()`` (no filtering) is
	        returned for an empty query or when the model call / parsing fails.
	    """
	    if not query:
	        return QueryIntent()

	    logger.log("🧠", f"Translating query: '{query}'...")

	    prompt = f"""
	You are a Data Query Translator for a Cloud FinOps Tool.
	Convert the user's natural language query into structured filters.

	User Query: "{query}"

	---------------------------------------------------------
	STRICT VOCABULARY MAPPING (DO NOT INVENT TERMS)
	---------------------------------------------------------
	1. SERVICES (Map keywords to EXACTLY one of these 5 options):
	- "database", "db", "sql", "rds", "postgres" -> ["RDS"]
	- "server", "vm", "instance", "ec2", "compute", "host" -> ["EC2"]
	- "volume", "disk", "storage", "ebs", "block store" -> ["EBS"]
	- "bucket", "object", "s3", "blob" -> ["S3"]
	- "function", "serverless", "lambda" -> ["Lambda"]
	* IF NO SPECIFIC SERVICE IS MENTIONED, RETURN [] (Empty List).

	2. ENVIRONMENTS (Map keywords to EXACTLY one of these 2 options):
	- "production", "prod", "live" -> "prod"
	- "development", "dev", "test", "staging", "qa" -> "dev"

	3. TEAMS (Map keywords to canonical team names):
	- "devops", "ops", "sre" -> "DevOps"
	- "ml", "ai", "machine learning" -> "ML"
	- "data", "analytics", "science" -> "Data"
	- "frontend", "web", "ui" -> "Frontend"
	- "backend", "api" -> "Backend"
	- "qa", "test", "quality" -> "QA"
	- "security", "infosec" -> "Security"
	- "platform", "infrastructure", "infra" -> "Platform"

	4. INTENT RULES:
	- "kill", "delete", "prune", "remove" -> focus_area="SAVINGS", min_cost=0.0
	- "waste", "zombie", "unused", "idle" -> focus_area="SAVINGS", min_cost=0.0
	- "expensive", "costly" -> focus_area="GENERAL", min_cost=100.0
	- "orphan", "untagged" -> only_orphans=True
	- "audit", "safety", "check" -> focus_area="SAFETY"
	- "biggest offender", "worst team", "team ranking", "team leaderboard" -> focus_area="TEAM_RANKING"

	OUTPUT FORMAT:
	Return valid JSON matching the schema.
	"""

	    try:
	        model = genai.GenerativeModel(GEMINI_MODEL)
	        response = model.generate_content(
	            prompt,
	            generation_config=genai.GenerationConfig(
	                response_mime_type="application/json",
	                response_schema=QueryIntentSchema
	            )
	        )
	        data = json.loads(response.text)
	        intent = QueryIntent(**data)

	        # Show "All" for any dimension the query did not constrain.
	        env_str = intent.target_env or "All"
	        svc_str = str(intent.target_services) if intent.target_services else "All"
	        team_str = intent.target_team or "All"

	        # Plain "|" separators: the previous "\|" was an invalid escape
	        # sequence and put literal backslashes into the log line.
	        logger.log("🎯", f"Intent: {intent.focus_area} | Team: {team_str} | Env: {env_str} | Svc: {svc_str} | Cost > ${intent.min_cost} | Orphans: {intent.only_orphans}")
	        return intent
	    except Exception as e:
	        # Best effort by design: any failure degrades to an unfiltered audit.
	        logger.log("⚠️", f"Intent extraction failed: {e}. Running full audit.")
	        return QueryIntent()

	def filter_dataframe(df, intent: QueryIntent, logger):
	    """Apply the LLM-derived intent to the billing DataFrame.

	    Filters are applied in order: service, environment, minimum cost, team,
	    orphans. Rows whose ``Tags`` cell cannot be parsed into a JSON object
	    never match the environment or team filter and always count as orphans.

	    Args:
	        df: DataFrame with at least ``Service``, ``Tags`` and ``Cost_Monthly``.
	        intent: Filters to apply; falsy fields are skipped.
	        logger: Object exposing ``log(emoji, message)``.

	    Returns:
	        The filtered DataFrame (all columns preserved, possibly zero rows).
	    """
	    initial_count = len(df)

	    def parse_tags(raw):
	        """Return the tag dict for a cell, or None when it is unusable."""
	        try:
	            parsed = json.loads(raw)
	        except (TypeError, ValueError):
	            # TypeError: NaN / non-string cell; ValueError: malformed JSON.
	            return None
	        return parsed if isinstance(parsed, dict) else None

	    def tag_text(tags, keys):
	        """Lowercased string form of a tag value ('' when absent)."""
	        value = get_tag(tags, keys, '')
	        return str(value).lower() if value else ''

	    def keep_rows(frame, predicate):
	        # astype(bool) matters on an empty frame: an empty object-dtype result
	        # is not recognised as a boolean mask and would select columns.
	        mask = frame['Tags'].apply(predicate).astype(bool)
	        return frame[mask]

	    # 1. Service Filter
	    if intent.target_services:
	        df = df[df['Service'].isin(intent.target_services)]

	    # 2. Environment Filter
	    if intent.target_env:
	        # Lowercased to agree with the team filter and the lowercased tag.
	        wanted_env = intent.target_env.lower()

	        def check_env(tags_str):
	            tags = parse_tags(tags_str)
	            if tags is None:
	                return False
	            return wanted_env in tag_text(tags, ['env', 'environment'])

	        df = keep_rows(df, check_env)

	    # 3. Cost Filter
	    if intent.min_cost and intent.min_cost > 0:
	        df = df[df['Cost_Monthly'] >= intent.min_cost]

	    # 4. Team Filter
	    if intent.target_team:
	        wanted_team = intent.target_team.lower()

	        def check_team(tags_str):
	            tags = parse_tags(tags_str)
	            if tags is None:
	                return False
	            return wanted_team in tag_text(tags, ['team', 'owner_team'])

	        df = keep_rows(df, check_team)

	    # 5. Orphan Filter
	    if intent.only_orphans:
	        def check_orphan(tags_str):
	            tags = parse_tags(tags_str)
	            if tags is None:
	                return True
	            # Orphan = No Team tag
	            return not get_tag(tags, ['team', 'owner_team'])

	        df = keep_rows(df, check_orphan)

	    logger.log("🔍", f"Scoped analysis from {initial_count} to {len(df)} resources.")
	    return df

	# --- 6. CORE ANALYSIS LOGIC ---

	def classify_simple_cases(df, logger):
	    """Classify obvious resources with fixed Python rules.

	    Rules, checked in order for every row:
	      1. EBS volume whose 'state' tag is 'available' and whose CPU_avg is 0
	         -> DELETE.
	      2. CPU_avg above 75 -> KEEP.
	    Every other row is returned untouched for AI classification.

	    Args:
	        df: DataFrame of resources; ``Tags`` is expected to hold a JSON object.
	        logger: Object exposing ``log(emoji, message)``.

	    Returns:
	        ``(simple_results, complex_cases)``: a list of result dicts with keys
	        'resource', 'action', 'reason', 'policy_rule', 'inferred_team', and a
	        list of plain resource dicts.
	    """
	    logger.log("⚡", "Applying Python rules for simple cases...")

	    simple_results = []
	    complex_cases = []

	    for _, row in df.iterrows():
	        resource = row.to_dict()
	        try:
	            tags = json.loads(resource.get('Tags', '{}'))
	        except (TypeError, ValueError):
	            # NaN / non-string cell or malformed JSON: treat as untagged.
	            tags = {}
	        if not isinstance(tags, dict):
	            # Valid JSON that is not an object (list, number, null).
	            tags = {}

	        cpu_avg = resource.get('CPU_avg', 0)

	        # Rule 1: Unattached EBS
	        if (resource.get('Service') == 'EBS'
	                and get_tag(tags, 'state') == 'available'
	                and cpu_avg == 0):
	            simple_results.append({
	                'resource': resource,
	                'action': 'DELETE',
	                'reason': 'Unattached EBS volume (no activity)',
	                'policy_rule': 'Rule: Unattached volumes should be deleted',
	                'inferred_team': infer_team_from_tags(tags, resource.get('ResourceID', ''))
	            })
	            continue

	        # Rule 2: Obvious Keep (High CPU)
	        if cpu_avg > 75:
	            simple_results.append({
	                'resource': resource,
	                'action': 'KEEP',
	                'reason': f'High CPU utilization ({resource.get("CPU_avg")}%)',
	                'policy_rule': 'High utilization indicates active use',
	                'inferred_team': infer_team_from_tags(tags, resource.get('ResourceID', ''))
	            })
	            continue

	        # Everything else is complex
	        complex_cases.append(resource)

	    logger.log("✅", f"Python rules: {len(simple_results)} simple, {len(complex_cases)} complex (need AI)")
	    return simple_results, complex_cases

	@tool
	def classify_resources(resources_json: str, policy_context: str) -> ClassificationResponse:
	    """Classify cloud resources based on policy using strong AI.

	    Args:
	        resources_json: JSON string containing a list of resource dictionaries to classify.
	        policy_context: Relevant policy text retrieved from the RAG engine.
	    """
	    # The docstring above is kept verbatim: the @tool decorator consumes it.
	    audit_prompt = f"""You are Sentinal, an expert Cloud FinOps Auditor.
	Analyze the following resources against the policy.

	Policy Context:
	{policy_context}

	Input Resources:
	{resources_json}

	YOUR TASKS:
	1. DECIDE ACTION: Determine if the resource should be KEEP, DELETE, HIBERNATE, or AUDIT.
	- DELETE: Low CPU (<10%) + idle (>30 days) + dev/test environment
	- HIBERNATE: Low CPU (<10%) + idle (>30 days) + prod environment
	- KEEP: High CPU (>50%) OR active (<30 days) OR critical tags
	- AUDIT: Missing required tags (team, environment) OR ambiguous state

	2. SHERLOCK MODE (Team Inference):
	- PRIMARY: Check the 'Team' tag first. If present and valid, use it.
	- FALLBACK: If 'Team' tag is missing, empty, or generic, use your world knowledge to infer ownership.
	- Look at 'ResourceID', 'Name', 'Service', or any other clues.

	Inference Examples:
	- 'kafka', 'hadoop', 'spark', 'airflow', 'ml-', 'training', 'gpu' → ML or Data
	- 'jenkins', 'terraform', 'k8s', 'docker', 'cicd' → DevOps or Platform
	- 'web', 'frontend', 'react', 'vue', 'ui' → Frontend
	- 'api', 'backend', 'service', 'payment', 'catalogue' → Backend
	- 'test', 'qa', 'selenium' → QA
	- 'security', 'vault', 'secrets' → Security
	- 'tableau', 'looker', 'analytics' → Data

	Canonical Teams: Finance, Data, QA, DevOps, Security, Frontend, Backend, Infrastructure, Platform, ML, Unknown

	- Set 'inferred_team' to the most likely team based on your analysis.
	- If absolutely no clues exist, set 'inferred_team' to "Unknown".

	3. CRITICAL SAFETY RULES:
	- Production resources (env=prod) must NEVER be DELETE. Use HIBERNATE instead.
	- If unsure, choose AUDIT to flag for human review.

	OUTPUT:
	For EACH resource in the input, you MUST return:
	- resource_id: Copy the 'ResourceID' field from the input resource exactly as-is
	- action: Your decision (KEEP/DELETE/HIBERNATE/AUDIT)
	- reason: Brief justification
	- policy_rule: The policy rule that applies
	- inferred_team: The team name you inferred

	CRITICAL: The 'resource_id' field is MANDATORY. Copy it from the input 'ResourceID' field.

	Return valid JSON matching the ClassificationResponse schema with ALL fields populated."""

	    try:
	        gemini = genai.GenerativeModel(GEMINI_MODEL)
	        json_config = genai.GenerationConfig(
	            response_mime_type="application/json",
	            response_schema=ClassificationResponseSchema
	        )
	        reply = gemini.generate_content(audit_prompt, generation_config=json_config)
	        # Validate the raw JSON through the Pydantic model.
	        payload = json.loads(reply.text)
	        return ClassificationResponse(**payload)
	    except Exception as err:
	        print(f"Gemini Error: {err}")
	        # Any failure degrades to "no classifications".
	        return ClassificationResponse(classifications=[])

	def apply_safety_rules(resource, classification, logger):
	    """Apply hardcoded safety overrides to a proposed action.

	    Args:
	        resource: Resource dict; ``Tags`` is expected to hold a JSON object.
	        classification: Dict with at least an ``"action"`` key.
	        logger: Accepted for interface compatibility; not used.

	    Returns:
	        ``"HIBERNATE"`` when a DELETE targets a resource whose environment tag
	        contains "prod", otherwise the original action unchanged.
	    """
	    action = classification["action"]

	    # classify_simple_cases() tolerates unreadable tags, so this guardrail must
	    # too; otherwise one malformed row aborts the whole audit.
	    try:
	        tags = json.loads(resource.get("Tags", "{}"))
	    except (TypeError, ValueError):
	        tags = {}
	    if not isinstance(tags, dict):
	        tags = {}

	    env_value = get_tag(tags, ['env', 'environment'])
	    env = str(env_value).lower() if env_value else ''

	    # CRITICAL: Prod Safety
	    if action == "DELETE" and "prod" in env:
	        return "HIBERNATE"

	    return action

	# --- 7. REPORT GENERATORS (HTML & EMAILS) ---

	def _report_team_label(team):
	    """Return a display name for a team given as an enum member, string or None."""
	    if team is None:
	        return "Unknown"
	    # Enum members expose .value; formatting the member itself may render as
	    # "TeamName.X" depending on the Python version.
	    return str(getattr(team, "value", team)) or "Unknown"


	def _report_bar_rows(rows, peak):
	    """Render ``(label, amount, color)`` rows as horizontal bars scaled to ``peak``."""
	    import html

	    chunks = []
	    for label, amount, color in rows:
	        pct = (amount / peak * 100) if peak > 0 else 0
	        chunks.append(f"""
	<div style="margin-bottom: 12px;">
	<div style="display: flex; justify-content: space-between; margin-bottom: 4px;">
	<span style="font-size: 13px; font-weight: 600; color: #1f2937;">{html.escape(str(label))}</span>
	<span style="font-size: 13px; font-weight: 700; color: {color};">${amount:,.2f}</span>
	</div>
	<div style="background: #f3f4f6; border-radius: 4px; height: 24px; overflow: hidden;">
	<div style="background: {color}; height: 100%; width: {pct}%; transition: width 0.3s;"></div>
	</div>
	</div>
	""")
	    return "".join(chunks)


	def generate_report(results, total_resources, total_spend):
	    """Generate the HTML report: scorecards, three bar charts and a triage table.

	    Args:
	        results: List of ``{"resource": dict, "classification": dict}`` items.
	            Each resource needs ``Cost_Monthly``, ``ResourceID`` and
	            ``Service``; DELETE/HIBERNATE items also need ``CPU_avg``.
	        total_resources: Accepted for interface compatibility; not used.
	        total_spend: Total monthly spend shown on the "Total Spend" card.

	    Returns:
	        An HTML string. Text taken from the data or the model (IDs, services,
	        reasons, team names) is HTML-escaped.
	    """
	    import html

	    action_colors = {"KEEP": "#10b981", "DELETE": "#ef4444", "HIBERNATE": "#f59e0b", "AUDIT": "#f97316"}
	    service_colors = {"EC2": "#ff9900", "RDS": "#3b82f6", "EBS": "#8b5cf6", "S3": "#10b981", "Lambda": "#f59e0b"}
	    team_colors = ["#ef4444", "#f59e0b", "#f97316", "#fb923c", "#fdba74"]
	    fallback_color = "#6b7280"
	    waste_actions = ('DELETE', 'HIBERNATE')

	    def read_tags(resource):
	        """Parse the resource's Tags cell; unreadable tags count as no tags."""
	        try:
	            parsed = json.loads(resource.get('Tags', '{}'))
	        except (TypeError, ValueError):
	            return {}
	        return parsed if isinstance(parsed, dict) else {}

	    # --- Single pass over the results for all aggregates ---
	    analyzed_spend = 0
	    unallocated_cost = 0
	    savings = 0
	    waste_cpu = []
	    action_costs = {"KEEP": 0, "DELETE": 0, "HIBERNATE": 0, "AUDIT": 0}
	    service_costs = {}
	    team_waste = {}

	    for r in results:
	        res = r['resource']
	        cls = r['classification']
	        act = cls['action']
	        cost = res['Cost_Monthly']

	        analyzed_spend += cost

	        # Same orphan definition as the orphan filter and the email fallback:
	        # neither a 'team' nor an 'owner_team' tag.
	        if not get_tag(read_tags(res), ['team', 'owner_team']):
	            unallocated_cost += cost

	        action_costs[act] = action_costs.get(act, 0) + cost

	        svc = res.get('Service', 'Unknown')
	        service_costs[svc] = service_costs.get(svc, 0) + cost

	        if act in waste_actions:
	            savings += cost
	            waste_cpu.append(res['CPU_avg'])
	            team = _report_team_label(cls.get('inferred_team'))
	            team_waste[team] = team_waste.get(team, 0) + cost

	    avg_waste_util = (sum(waste_cpu) / len(waste_cpu)) if waste_cpu else 0

	    # --- Charts ---
	    # 1. Cost by action, in a fixed order of importance.
	    action_chart_html = _report_bar_rows(
	        [(a, action_costs.get(a, 0), action_colors.get(a, fallback_color))
	         for a in ("DELETE", "HIBERNATE", "KEEP", "AUDIT")],
	        max(action_costs.values()),
	    )

	    # 2. Cost by service, most expensive first.
	    sorted_services = sorted(service_costs.items(), key=lambda x: x[1], reverse=True)
	    service_chart_html = _report_bar_rows(
	        [(svc, cost, service_colors.get(svc, fallback_color)) for svc, cost in sorted_services],
	        sorted_services[0][1] if sorted_services else 1,
	    )

	    # 3. Team waste leaderboard (top 5 teams by DELETE + HIBERNATE cost).
	    sorted_teams = sorted(team_waste.items(), key=lambda x: x[1], reverse=True)
	    team_chart_html = _report_bar_rows(
	        [(team, waste, team_colors[idx]) for idx, (team, waste) in enumerate(sorted_teams[:5])],
	        sorted_teams[0][1] if sorted_teams else 1,
	    )

	    # NOTE(review): the "Model" badge below is a hard-coded label and does not
	    # follow GEMINI_MODEL.
	    report = """
	<div style="font-family: 'Inter', system-ui, sans-serif; color: #1f2937;">

	<!-- Badges -->
	<div style="display: flex; gap: 10px; margin-bottom: 20px;">
	<img src="https://img.shields.io/badge/Status-Complete-success?style=flat-square" alt="Status">
	<img src="https://img.shields.io/badge/Model-Gemini_2.0_Flash-blue?style=flat-square" alt="Model">
	<img src="https://img.shields.io/badge/Security-Enterprise_Grade-purple?style=flat-square" alt="Security">
	</div>

	<div style="font-family: 'Inter', system-ui, sans-serif; color: #1f2937;">

	<!-- CFO Scorecards -->
	<div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 20px; margin-bottom: 30px;">
	<!-- CARD 1: POTENTIAL SAVINGS (Most Important) -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); border-left: 5px solid #10b981;">
	<h3 style="margin: 0 0 10px 0; color: #6b7280; font-size: 0.875rem; text-transform: uppercase; letter-spacing: 0.05em;">💰 Potential Savings</h3>
	<p style="margin: 0; font-size: 1.875rem; font-weight: 800; color: #10b981;">${savings:,.2f}</p>
	<p style="margin: 5px 0 0 0; font-size: 0.75rem; color: #9ca3af;">Delete + Hibernate</p>
	</div>

	<!-- CARD 2: TOTAL SPEND -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
	<h3 style="margin: 0 0 10px 0; color: #6b7280; font-size: 0.875rem; text-transform: uppercase; letter-spacing: 0.05em;">Total Spend</h3>
	<p style="margin: 0; font-size: 1.5rem; font-weight: 700; color: #1f2937;">${total_spend:,.2f}</p>
	<p style="margin: 5px 0 0 0; font-size: 0.75rem; color: #9ca3af;">Analyzed: ${analyzed_spend:,.2f}</p>
	</div>

	<!-- CARD 3: ORPHAN SPEND -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); border-left: 5px solid #f59e0b;">
	<h3 style="margin: 0 0 10px 0; color: #6b7280; font-size: 0.875rem; text-transform: uppercase; letter-spacing: 0.05em;">Orphan Spend</h3>
	<p style="margin: 0; font-size: 1.5rem; font-weight: 700; color: #1f2937;">${unallocated_cost:,.2f}</p>
	<p style="margin: 5px 0 0 0; font-size: 0.75rem; color: #9ca3af;">No Team Tag</p>
	</div>

	<!-- CARD 4: WASTE CPU -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); border-left: 5px solid #3b82f6;">
	<h3 style="margin: 0 0 10px 0; color: #6b7280; font-size: 0.875rem; text-transform: uppercase; letter-spacing: 0.05em;">Waste CPU Avg</h3>
	<p style="margin: 0; font-size: 1.5rem; font-weight: 700; color: #1f2937;">{avg_waste_util:.1f}%</p>
	<p style="margin: 5px 0 0 0; font-size: 0.75rem; color: #9ca3af;">Idle Resources</p>
	</div>
	</div>

	<!-- DATA VISUALIZATIONS -->
	<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 20px; margin-bottom: 30px;">
	<!-- CHART 1: ACTION BREAKDOWN -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
	<h3 style="margin: 0 0 15px 0; color: #1f2937; font-size: 1rem; font-weight: 700;">💼 Cost by Action</h3>
	{action_chart}
	</div>

	<!-- CHART 2: SERVICE BREAKDOWN -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
	<h3 style="margin: 0 0 15px 0; color: #1f2937; font-size: 1rem; font-weight: 700;">🔧 Cost by Service</h3>
	{service_chart}
	</div>

	<!-- CHART 3: TEAM WASTE LEADERBOARD -->
	<div style="background: white; padding: 20px; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);">
	<h3 style="margin: 0 0 15px 0; color: #1f2937; font-size: 1rem; font-weight: 700;">🏆 Team Waste Ranking</h3>
	{team_chart}
	</div>
	</div>

	<h3>🛡️ Triage Details</h3>
	<div style="overflow: auto; max-height: 500px; border: 1px solid #e5e7eb; border-radius: 8px;">
	<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
	<thead style="position: sticky; top: 0; background: #f9fafb;">
	<tr style="border-bottom: 1px solid #e5e7eb;">
	<th style="padding: 12px; text-align: left;">ResourceID</th>
	<th style="padding: 12px; text-align: left;">Service</th>
	<th style="padding: 12px; text-align: right;">Cost</th>
	<th style="padding: 12px; text-align: left;">Action</th>
	<th style="padding: 12px; text-align: left;">Reason</th>
	</tr>
	</thead>
	<tbody>
	""".format(
	        total_spend=total_spend,
	        analyzed_spend=analyzed_spend,
	        savings=savings,
	        unallocated_cost=unallocated_cost,
	        avg_waste_util=avg_waste_util,
	        action_chart=action_chart_html,
	        service_chart=service_chart_html,
	        team_chart=team_chart_html
	    )

	    # --- Triage table rows ---
	    table_rows = []
	    for result in results:
	        res = result["resource"]
	        cls = result["classification"]
	        color = action_colors.get(cls['action'], fallback_color)

	        table_rows.append(f"""
	<tr style="border-bottom: 1px solid #f3f4f6;">
	<td style="padding: 10px; font-family: monospace;">{html.escape(str(res['ResourceID']))}</td>
	<td style="padding: 10px;">{html.escape(str(res['Service']))}</td>
	<td style="padding: 10px; text-align: right;">${res['Cost_Monthly']:.2f}</td>
	<td style="padding: 10px;"><span style="color: {color}; font-weight: 700;">{html.escape(str(cls['action']))}</span></td>
	<td style="padding: 10px;">{html.escape(str(cls['reason']))}</td>
	</tr>
	""")

	    # Closes the table wrapper and BOTH outer containers opened in the template.
	    return report + "".join(table_rows) + "</tbody></table></div></div></div>"

	def generate_emails(results):
	    """Generate one Markdown email draft per team for all non-KEEP resources.

	    Team attribution uses the classification's ``inferred_team`` first and
	    falls back to the resource's 'team' / 'owner_team' tag when that is
	    missing or "Unknown".

	    Args:
	        results: List of ``{"resource": dict, "classification": dict}`` items.

	    Returns:
	        A Markdown string with one fenced draft per team, or a short
	        "No Action Required" message when nothing needs attention.
	    """
	    action_emojis = {'DELETE': '🔴', 'HIBERNATE': '🟡', 'AUDIT': '🟠'}
	    team_data = {}

	    # Group resources by team
	    for result in results:
	        resource = result["resource"]
	        classification = result["classification"]

	        # Constraint: Do NOT generate emails for resources marked "KEEP"
	        if classification["action"] == "KEEP":
	            continue

	        # 1. Trust AI first (value may be an Enum member or a plain string)
	        team_name = "Unknown"
	        inferred = classification.get("inferred_team")
	        if inferred:
	            team_name = str(getattr(inferred, "value", inferred))

	        # 2. Fallback to existing Tag (if AI returned Unknown)
	        if team_name == "Unknown":
	            try:
	                tags = json.loads(resource.get('Tags', '{}'))
	            except (TypeError, ValueError):
	                # Unreadable tags must not abort email generation.
	                tags = {}
	            tag_team = get_tag(tags, ['team', 'owner_team'], "") if isinstance(tags, dict) else ""
	            if tag_team:
	                team_name = str(tag_team).strip().title() or "Unknown"

	        bucket = team_data.setdefault(team_name, {'resources': [], 'poc_emails': set()})
	        bucket['resources'].append(result)

	        # Extract POC email (use OwnerEmail as POC). The isinstance check
	        # skips NaN cells, which are truthy floats and break the '@' test.
	        poc_email = resource.get('OwnerEmail', '')
	        if isinstance(poc_email, str) and '@' in poc_email:
	            bucket['poc_emails'].add(poc_email)

	    if not team_data:
	        return "# No Action Required\n\nAll resources are optimally configured. No emails to send."

	    parts = [
	        f"# 📧 Team Email Drafts ({len(team_data)} Teams)\n\n",
	        "_Copy and paste each email below to send to the team POC_\n\n",
	        "---\n\n",
	    ]

	    # Generate ONE email draft per Team
	    for team in sorted(team_data.keys()):
	        items = team_data[team]['resources']
	        poc_emails = team_data[team]['poc_emails']

	        # Total monthly cost of every flagged resource for this team
	        savings = sum(r['resource']['Cost_Monthly'] for r in items)

	        action_counts = {}
	        for item in items:
	            action = item['classification']['action']
	            action_counts[action] = action_counts.get(action, 0) + 1

	        # Email header
	        parts.append(f"## 📮 Email Draft for {team}\n\n")
	        parts.append("```\n")
	        parts.append(f"To: {', '.join(sorted(poc_emails)) if poc_emails else 'team-email@company.com'}\n")
	        parts.append(f"Subject: [Action Required] Cloud FinOps Audit - ${savings:,.2f} Potential Savings\n")
	        parts.append("\n")

	        # Email body
	        parts.append(f"Dear {team},\n\n")
	        parts.append(f"Our automated FinOps audit has identified {len(items)} cloud resources under your team's ownership ")
	        parts.append(f"that can be optimized to save ${savings:,.2f} per month.\n\n")

	        parts.append("Summary:\n")
	        # Plain "|" separator: "\|" was an invalid escape sequence that put a
	        # literal backslash into the email text.
	        action_summary = " | ".join(f"{count} {action}" for action, count in sorted(action_counts.items()))
	        parts.append(f"- {len(items)} resources flagged: {action_summary}\n")
	        parts.append(f"- Total potential savings: ${savings:,.2f}/month\n\n")

	        parts.append("Top Resources by Cost:\n\n")

	        # Show the five most expensive resources
	        top_resources = sorted(items, key=lambda x: x['resource']['Cost_Monthly'], reverse=True)[:5]
	        for i, item in enumerate(top_resources, 1):
	            res = item['resource']
	            cls = item['classification']
	            action_emoji = action_emojis.get(cls['action'], '⚪')

	            parts.append(f"{i}. {res['ResourceID']} ({res['Service']})\n")
	            parts.append(f" Cost: ${res['Cost_Monthly']:.2f}/mo | Action: {action_emoji} {cls['action']}\n")
	            parts.append(f" Reason: {cls['reason'][:80]}{'...' if len(cls['reason']) > 80 else ''}\n\n")

	        if len(items) > 5:
	            parts.append(f"...and {len(items) - 5} more resources (see full report)\n\n")

	        parts.append("Next Steps:\n")
	        parts.append("1. Review the flagged resources in the attached report\n")
	        parts.append("2. Confirm or reject the recommended actions\n")
	        parts.append("3. Contact finops@company.com with questions\n\n")

	        parts.append("Best regards,\n")
	        parts.append("Cloud FinOps Team\n")
	        parts.append("```\n\n")
	        parts.append("---\n\n")

	    return "".join(parts)

	# --- 8. MAIN ORCHESTRATION ---

	def run_audit(custom_query="", progress_callback=None, log_callback=None):
	    """Run the full audit pipeline; main entry point called by app.py.

	    Steps: load ``billing_export.csv``, optionally scope it with the query
	    intent, classify simple cases with Python rules, send the most expensive
	    remaining resources to the AI classifier, apply safety overrides, then
	    build the HTML report and email drafts.

	    Args:
	        custom_query: Optional natural-language query used to filter resources.
	        progress_callback: Optional ``callable(fraction, message)``.
	        log_callback: Optional callable receiving each reasoning log line.

	    Returns:
	        A JSON string with keys ``report``, ``emails`` and ``reasoning``.
	        Failures are reported inside ``report`` rather than raised.
	    """
	    logger = ReasoningLogger(log_callback)

	    def notify(fraction, message):
	        """Forward progress only when a callback was supplied."""
	        if progress_callback:
	            progress_callback(fraction, message)

	    try:
	        logger.log("🚀", "Starting Sentinal Cloud Audit...")
	        notify(0.1, "🚀 Initializing...")

	        # Load Data
	        if not os.path.exists("billing_export.csv"):
	            # Same three keys as every other return path.
	            return json.dumps({
	                "report": "Error: billing_export.csv missing",
	                "emails": "",
	                "reasoning": logger.get_logs()
	            })

	        df = pd.read_csv("billing_export.csv")
	        logger.log("📊", f"Loaded {len(df)} resources.")

	        # --- STEP 1: INTENT EXTRACTION ---
	        if custom_query:
	            intent = extract_query_intent(custom_query, logger)
	            df = filter_dataframe(df, intent, logger)

	        if len(df) == 0:
	            return json.dumps({"report": "<h3>No resources matched your query.</h3>", "emails": "", "reasoning": logger.get_logs()})

	        # --- STEP 2: HYBRID ANALYSIS ---
	        notify(0.3, "⚡ Running Python Heuristics...")
	        simple_results, complex_cases = classify_simple_cases(df, logger)

	        all_results = [
	            {
	                "resource": sr['resource'],
	                "classification": {
	                    "action": sr['action'],
	                    "reason": sr['reason'],
	                    "policy_rule": sr['policy_rule'],
	                    "inferred_team": sr.get('inferred_team', TeamName.UNKNOWN)
	                }
	            }
	            for sr in simple_results
	        ]

	        # --- STEP 3: AI REASONING ---
	        if complex_cases:
	            notify(0.6, f"🤖 AI Analyzing {len(complex_cases)} complex cases...")
	            logger.log("📚", "Checking Policy via RAG...")
	            policy_ctx = query_policy("cloud resource lifecycle policy including deletion, hibernation, and retention rules")

	            # OPTIMIZATION: only the TOP_N most expensive complex cases go to
	            # the AI. The remainder is skipped and does not appear in the report.
	            complex_cases.sort(key=lambda x: x.get('Cost_Monthly', 0), reverse=True)

	            TOP_N = 100
	            high_impact_batch = complex_cases[:TOP_N]
	            skipped_count = len(complex_cases) - len(high_impact_batch)

	            if skipped_count > 0:
	                logger.log("⚡", f"Prioritizing Top {TOP_N} spenders. Skipped {skipped_count} low-cost items.")

	            notify(0.7, f"🤖 AI Analyzing Top {len(high_impact_batch)} High-Impact Cases...")

	            try:
	                resources_json = json.dumps(high_impact_batch)
	                ai_response = classify_resources(resources_json, policy_ctx)

	                # Index the batch once; setdefault keeps the first row per ID,
	                # matching the previous first-match linear search.
	                by_id = {}
	                for candidate in high_impact_batch:
	                    by_id.setdefault(candidate['ResourceID'], candidate)

	                for cls in ai_response.classifications:
	                    res = by_id.get(cls.resource_id)
	                    if res:
	                        all_results.append({
	                            "resource": res,
	                            "classification": {
	                                "action": cls.action,
	                                "reason": cls.reason,
	                                "policy_rule": cls.policy_rule,
	                                "inferred_team": cls.inferred_team
	                            }
	                        })
	            except Exception as e:
	                # Best effort: keep the rule-based results if the AI step fails.
	                logger.log("⚠️", f"AI Analysis Error: {e}")

	        # --- STEP 4: SAFETY CHECKS ---
	        logger.log("🛡️", "Applying final safety guardrails...")
	        final_results = []
	        for res in all_results:
	            res['classification']['action'] = apply_safety_rules(res['resource'], res['classification'], logger)
	            final_results.append(res)

	        # --- STEP 5: REPORTING ---
	        notify(0.9, "📝 Generating Report...")
	        total_spend = df['Cost_Monthly'].sum()
	        report = generate_report(final_results, len(df), total_spend)
	        emails = generate_emails(final_results)

	        logger.log("✅", "Audit Complete.")

	        return json.dumps({
	            "report": report,
	            "emails": emails,
	            "reasoning": logger.get_logs()
	        })

	    except Exception as e:
	        # Top-level boundary: print the traceback and return the error as data.
	        import traceback
	        traceback.print_exc()
	        logger.log("❌", f"Fatal Error: {e}")
	        return json.dumps({"report": f"Error: {e}", "emails": "", "reasoning": logger.get_logs()})

	# Manual smoke test: runs one audit with a sample query and prints the JSON
	# result when the module is executed directly.
	if __name__ == "__main__":
	    # Test run
	    print(run_audit("find expensive databases"))