Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

File size: 7,373 Bytes

# hf_demo.py – ARF v4 dashboard for Hugging Face Spaces
import logging
from datetime import datetime, timezone

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import gradio as gr

# ARF v4 imports
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
from agentic_reliability_framework.runtime.memory.constants import MemoryConstants

# Configure the root logger once, at import time, and create this module's
# logger; the snapshot helpers below use it to record failures.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object that the @app.get / @app.post routes attach to.
app = FastAPI(title="ARF v4 API with Memory")

# Enable CORS for the hosted frontend. Exactly one origin is allowed; every
# HTTP method and every request header is accepted from that origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://arf-frontend-sandy.vercel.app"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------------------------------------------------------------------------
# Initialize ARF components
# ---------------------------------------------------------------------------
# Single RiskEngine instance, built with its default arguments at import time
# and shared by the API route and the dashboard helpers.
risk_engine = RiskEngine()

# Create FAISS index and memory (using default dimension from constants).
# Both objects are module-level, so every request handler shares them.
# NOTE(review): presumably the stored incidents live only in this process and
# are lost on restart — confirm whether RAGGraphMemory persists anything.
faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
memory = RAGGraphMemory(faiss_index)

# ---------------------------------------------------------------------------
# API Endpoints
# ---------------------------------------------------------------------------
@app.get("/")
async def root():
    """Return service metadata together with the current memory statistics.

    ``memory_stats`` holds the result of ``memory.get_graph_stats()`` when
    ``memory.has_historical_data()`` is truthy, and the literal string
    ``"empty"`` otherwise.
    """
    if memory.has_historical_data():
        stats = memory.get_graph_stats()
    else:
        stats = "empty"

    payload = {
        "service": "ARF OSS API",
        "version": "4.0.0",
        "status": "operational",
    }
    payload["memory_stats"] = stats
    return payload

@app.get("/health")
async def health():
    """Liveness probe: report a fixed ``ok`` status and the API version."""
    return dict(status="ok", version="4.0.0")

@app.get("/api/v1/get_risk")
async def get_risk():
    """
    Compute a safe risk snapshot using the supported RiskEngine.calculate_risk()
    API. This avoids calling the removed get_current_risk() method.

    Returns:
        A dict with ``system_risk`` (the numeric risk), ``status``
        (``"critical"`` or ``"normal"``) and ``details`` (the full snapshot
        produced by ``_calculate_demo_risk()``).

    Raises:
        HTTPException: status 500, with the error text as ``detail``, when
            the risk computation fails.
    """
    try:
        score = _calculate_demo_risk()
        return {
            "system_risk": score["risk"],
            # Reuse the status the helper already derived so the 0.8
            # threshold is defined in one place only.
            "status": score["status"],
            "details": score,
        }
    except Exception as e:
        # Record the traceback server-side, as the dashboard helpers do,
        # and keep the original exception chained to the HTTP error.
        logger.exception("Failed to compute risk for /api/v1/get_risk")
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.post("/api/v1/incident")
async def store_incident(event_data: dict, analysis: dict):
    """Store one incident in the shared memory and return its identifier.

    Args:
        event_data: Incident payload, forwarded unchanged to
            ``memory.store_incident``.
        analysis: Analysis payload, forwarded unchanged as well.

    Returns:
        ``{"incident_id": ...}`` with whatever ``memory.store_incident``
        returned.

    Raises:
        HTTPException: status 500, with the error text as ``detail``, when
            storing fails.

    NOTE(review): ``find_similar_incidents`` hands the memory an object with
    attributes, whereas this route forwards a plain dict — confirm that
    ``memory.store_incident`` accepts a dict for ``event_data``.
    """
    try:
        incident_id = memory.store_incident(event_data, analysis)
        return {"incident_id": incident_id}
    except Exception as e:
        # Log the traceback before converting to an HTTP error so the
        # failure is diagnosable from server logs, and keep the cause chained.
        logger.exception("Failed to store incident")
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/api/v1/memory/similar")
async def find_similar_incidents(action: str, k: int = 5):
    """Look up stored incidents similar to a free-text action.

    A placeholder event carrying neutral metrics and the given ``action`` is
    built and passed to ``memory.find_similar``; each returned node is then
    flattened into a plain dict.

    Args:
        action: Action text used for the placeholder event and the analysis.
        k: Maximum number of neighbours to request; must be at least 1.

    Returns:
        ``{"similar": [...], "count": n}`` where every entry exposes the
        node's ``incident_id``, ``component``, ``severity``, ``timestamp``,
        ``metrics``, ``agent_analysis`` and a ``similarity_score`` (0.0 when
        the node's metadata does not provide one).

    Raises:
        HTTPException: status 400 when ``k`` is smaller than 1; status 500,
            with the error text as ``detail``, when the lookup fails.
    """
    # Reject a non-positive neighbour count up front instead of passing it
    # on to the similarity search.
    if k < 1:
        raise HTTPException(status_code=400, detail="k must be a positive integer")

    class DummyEvent:
        """Stand-in event exposing the attributes set below."""

        def __init__(self, action: str):
            self.component = "user_action"
            self.latency_p99 = 0.0
            self.error_rate = 0.0
            self.throughput = 0
            self.cpu_util = 0.0
            self.memory_util = 0.0
            # Timezone-aware UTC, matching every other timestamp in this file.
            self.timestamp = datetime.now(timezone.utc)
            self.severity = "low"
            self.action = action

    try:
        similar = memory.find_similar(DummyEvent(action), {"action": action}, k=k)
        results = [
            {
                "incident_id": node.incident_id,
                "component": node.component,
                "severity": node.severity,
                "timestamp": node.timestamp,
                "metrics": node.metrics,
                "agent_analysis": node.agent_analysis,
                "similarity_score": node.metadata.get("similarity_score", 0.0),
            }
            for node in similar
        ]
    except Exception as e:
        # Same error contract as the other API routes: log, then answer 500.
        logger.exception("Failed to find similar incidents")
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"similar": results, "count": len(results)}

@app.get("/api/v1/memory/stats")
async def memory_stats():
    return memory.get_graph_stats()

# ---------------------------------------------------------------------------
# Gradio dashboard
# ---------------------------------------------------------------------------

class _DemoIntent:
    """
    Bare-bones stand-in intent used only for the demo risk snapshot.

    It exposes three class-level string attributes and nothing else. Per the
    original author's note, RiskEngine.categorize_intent() falls back to
    DEFAULT for this object (not verifiable from this file).
    """
    service_name = "demo"
    deployment_target = "dev"
    environment = "dev"

def _calculate_demo_risk():
    """
    Produce a demo risk snapshot through RiskEngine.calculate_risk(), the
    supported replacement for the removed get_current_risk() method.

    The engine is called with a fresh _DemoIntent, no cost estimate and no
    policy violations. The returned dict has the keys ``risk`` (float),
    ``status`` ("critical" above 0.8, otherwise "normal"), ``explanation``
    and ``contributions``.
    """
    risk_value, explanation, contributions = risk_engine.calculate_risk(
        intent=_DemoIntent(),
        cost_estimate=None,
        policy_violations=[],
    )

    risk = float(risk_value)
    # The threshold is checked against the engine's raw value.
    if risk_value > 0.8:
        status = "critical"
    else:
        status = "normal"

    return dict(
        risk=risk,
        status=status,
        explanation=explanation,
        contributions=contributions,
    )

def get_risk_snapshot():
    """Return the demo risk snapshot stamped with the current UTC time.

    On any failure the exception is logged and an error dict
    (``status``/``error``/``timestamp``) is returned instead of raising, so
    the dashboard always has something to render.
    """
    try:
        result = _calculate_demo_risk()
        result["timestamp"] = datetime.now(timezone.utc).isoformat()
    except Exception as exc:
        logger.exception("Failed to compute risk snapshot")
        result = dict(
            status="error",
            error=str(exc),
            timestamp=datetime.now(timezone.utc).isoformat(),
        )
    return result

def get_health_snapshot():
    """Return a static health payload stamped with the current UTC time.

    Returns:
        A dict with ``status`` ("ok"), ``version``, ``service`` and an
        ISO-8601 ``timestamp`` in UTC.

    The previous try/except wrapper was removed: the only call that could
    fail in the body (building the timestamp) was repeated verbatim in the
    handler, so the error branch could never produce a result.
    """
    return {
        "status": "ok",
        "version": "4.0.0",
        "service": "ARF OSS API",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

def get_memory_snapshot():
    """Return memory statistics for the dashboard, stamped with UTC time.

    The result's ``status`` is "ok" with the graph stats when the memory
    reports historical data, "empty" with a placeholder message when it does
    not, and "error" with the exception text when either memory call fails
    (the failure is logged rather than raised).
    """
    try:
        if not memory.has_historical_data():
            return {
                "status": "empty",
                "memory_stats": "No historical memory yet.",
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }

        graph_stats = memory.get_graph_stats()
        return {
            "status": "ok",
            "memory_stats": graph_stats,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
    except Exception as exc:
        logger.exception("Failed to compute memory snapshot")
        return dict(
            status="error",
            error=str(exc),
            timestamp=datetime.now(timezone.utc).isoformat(),
        )

# Build the dashboard UI: three JSON panels, one refresh button per panel.
with gr.Blocks(title="ARF v4 Demo") as demo:
    gr.Markdown("# Agentic Reliability Framework v4")
    gr.Markdown("### Status dashboard")

    # Row holding the health and risk panels.
    with gr.Row():
        health_output = gr.JSON(label="Health")
        risk_output = gr.JSON(label="Current Risk")

    # Row holding the memory statistics panel.
    with gr.Row():
        memory_output = gr.JSON(label="Memory Stats")

    # Row holding the manual refresh buttons.
    with gr.Row():
        refresh_btn = gr.Button("Refresh Risk")
        health_btn = gr.Button("Refresh Health")
        memory_btn = gr.Button("Refresh Memory")

    # Each button calls its snapshot function and sends the returned dict to
    # the matching JSON panel. The snapshot functions take no inputs.
    refresh_btn.click(fn=get_risk_snapshot, outputs=risk_output)
    health_btn.click(fn=get_health_snapshot, outputs=health_output)
    memory_btn.click(fn=get_memory_snapshot, outputs=memory_output)

    # Load initial state after startup, not during import.
    # The same three snapshot functions are registered as load handlers.
    demo.load(fn=get_health_snapshot, outputs=health_output)
    demo.load(fn=get_risk_snapshot, outputs=risk_output)
    demo.load(fn=get_memory_snapshot, outputs=memory_output)

# ============== MAIN ENTRY POINT ==============
if __name__ == "__main__":
    # Launch Gradio directly to keep the Space alive and avoid the startup crash.
    # NOTE(review): only the Gradio `demo` is launched here; nothing in this
    # file serves the FastAPI `app`, so its routes (/health, /api/v1/...) are
    # presumably unreachable when the module runs as a script — confirm
    # whether `app` is served some other way (e.g. an ASGI server importing
    # this module).
    demo.launch(server_name="0.0.0.0")