Spaces:

lablab-ai-amd-developer-hackathon
/

riprap-nyc

Running

File size: 2,543 Bytes

76f3ae6

"""Per-query energy footprint estimate.

Conservative, defensible numbers — no overclaim. We measure local
inference time and apply a published-range package-power figure for
Apple-Silicon LLM inference; we compare to the most recent published
estimate of frontier-cloud per-query energy (Epoch AI, 2025).

This is not a benchmark — it's a transparent rule-of-thumb that the
user can audit. The system prompt and the UI both surface the
underlying numbers and the citation.
"""
from __future__ import annotations

# Local: Granite 4.1:3b on Apple M-series (M3/M4 Pro range).
# Sustained package power, in watts, during ~5 s of LLM inference with
# q4_K_M quantisation.
# Source: ml.energy + community measurements; conservative midpoint.
LOCAL_PACKAGE_POWER_W = 20.0

# Frontier cloud per-query inference energy, in watt-hours.
# Source: Epoch AI, "How much energy does ChatGPT use?" (2025).
# https://epoch.ai/gradient-updates/how-much-energy-does-chatgpt-use
# This is a typical-query estimate for GPT-4o-class inference; long-context
# queries scale roughly linearly with token count.
CLOUD_PER_QUERY_WH = 0.30

# Citation strings used in the UI. The quoted figures are interpolated from
# the constants above so the text shown to the user cannot drift from the
# values the calculation actually uses. The ":g" format drops trailing
# zeros (20.0 -> "20", 0.30 -> "0.3").
LOCAL_SOURCE = (
    f"ml.energy / community measurements; ~{LOCAL_PACKAGE_POWER_W:g} W package power "
    "during Granite 4.1:3b q4_K_M inference on Apple M-series."
)
CLOUD_SOURCE = (
    'Epoch AI (2025), "How much energy does ChatGPT use?", '
    f"estimating ~{CLOUD_PER_QUERY_WH:g} Wh per typical GPT-4o query."
)


def estimate(reconcile_seconds: float, total_seconds: float | None = None) -> dict:
    """Build the energy-footprint summary for a single query.

    Local energy is the fixed package-power figure multiplied by the
    measured reconcile time and converted to watt-hours; the cloud
    figure is the published per-query constant.

    Args:
      reconcile_seconds: wallclock of the Granite reconcile step (the
        only step that meaningfully draws CPU/GPU power).
      total_seconds: optional full-FSM wallclock; reported back for
        context only and not used in the energy calculation.

    Returns:
      A dict holding the local and cloud estimates in Wh and mWh, the
      cloud/local ratio (``None`` when the local estimate is not
      positive), a ``method`` sub-dict with the formula and citation
      strings, and the rounded input timings.
    """
    # Watts × seconds is watt-seconds; dividing by 3600 gives watt-hours.
    energy_wh = LOCAL_PACKAGE_POWER_W * reconcile_seconds / 3600.0

    # Omit the ratio rather than divide by a zero or negative estimate.
    if energy_wh > 0:
        cloud_vs_local = round(CLOUD_PER_QUERY_WH / energy_wh, 1)
    else:
        cloud_vs_local = None

    if total_seconds is None:
        total_rounded = None
    else:
        total_rounded = round(total_seconds, 2)

    # Human-auditable description of how each number was obtained.
    method = {
        "local": f"{LOCAL_PACKAGE_POWER_W} W × {reconcile_seconds:.2f} s ÷ 3600",
        "local_source": LOCAL_SOURCE,
        "cloud": f"{CLOUD_PER_QUERY_WH} Wh per query (published estimate)",
        "cloud_source": CLOUD_SOURCE,
    }

    summary = {}
    summary["local_wh"] = round(energy_wh, 4)
    summary["local_mwh"] = round(energy_wh * 1000, 1)
    summary["cloud_wh"] = CLOUD_PER_QUERY_WH
    summary["cloud_mwh"] = round(CLOUD_PER_QUERY_WH * 1000, 1)
    summary["ratio_cloud_over_local"] = cloud_vs_local
    summary["method"] = method
    summary["reconcile_seconds"] = round(reconcile_seconds, 2)
    summary["total_seconds"] = total_rounded
    return summary