fix(emissions): default hardware to NVIDIA L4
The MI300X droplet was retired 2026-05-06; both inference Spaces
(msradam/riprap-vllm for Granite 4.1 8B FP8 + msradam/riprap-inference
for Prithvi / TerraMind / TTM / GLiNER / Embedding) now run on NVIDIA
L4 (24 GB, Ada Lovelace, 72 W TGP). Updates:
- app/emissions.py: HARDWARE adds nvidia_l4 (~60 W sustained per the
L4 data sheet) and reorders so it's the first/canonical entry. The
MI300X entry stays for operators who redeploy to that hardware and
set RIPRAP_HARDWARE_LABEL=AMD MI300X explicitly.
- app/llm.py:_hardware_for: when RIPRAP_LLM_BASE_URL is set (any
remote vLLM/Ollama backend), default to nvidia_l4 instead of MI300X.
RIPRAP_HARDWARE_LABEL override matrix expanded for l4 / t4 / mi300x.
- app/inference.py:_post: record nvidia_l4 by default; honor MI300X /
T4 overrides via RIPRAP_HARDWARE_LABEL.
Net effect on the briefing: a typical query (~700 LLM tokens + a few
ML calls on L4) now reports ~50-80 mWh instead of the inflated
600 mWh that the MI300X profile produced.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- app/emissions.py +20 -4
- app/inference.py +14 -5
- app/llm.py +16 -5
|
@@ -30,24 +30,40 @@ from typing import Any
|
|
| 30 |
|
| 31 |
# (label, sustained_power_w, source)
|
| 32 |
HARDWARE: dict[str, tuple[str, float, str]] = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"amd_mi300x": (
|
| 34 |
"AMD MI300X",
|
| 35 |
600.0,
|
| 36 |
"AMD Instinct MI300X data sheet (750 W TDP); ~600 W sustained "
|
| 37 |
-
"during vLLM generation
|
| 38 |
-
"
|
|
|
|
|
|
|
|
|
|
| 39 |
),
|
| 40 |
"nvidia_t4": (
|
| 41 |
"NVIDIA T4",
|
| 42 |
50.0,
|
| 43 |
"NVIDIA T4 data sheet (70 W max); ~50 W sustained during "
|
| 44 |
-
"transformer inference."
|
|
|
|
|
|
|
| 45 |
),
|
| 46 |
"apple_m": (
|
| 47 |
"Apple M-series",
|
| 48 |
20.0,
|
| 49 |
"ml.energy / community measurements: ~20 W package power "
|
| 50 |
-
"during Granite 4.1 q4_K_M inference on Apple M3/M4
|
|
|
|
| 51 |
),
|
| 52 |
"cpu_server": (
|
| 53 |
"x86 CPU",
|
|
|
|
| 30 |
|
| 31 |
# (label, sustained_power_w, source)
|
| 32 |
HARDWARE: dict[str, tuple[str, float, str]] = {
|
| 33 |
+
"nvidia_l4": (
|
| 34 |
+
"NVIDIA L4",
|
| 35 |
+
60.0,
|
| 36 |
+
"NVIDIA L4 Tensor Core GPU data sheet (72 W TGP, Ada Lovelace, "
|
| 37 |
+
"24 GB); ~60 W sustained during transformer inference. The "
|
| 38 |
+
"active backend for both Riprap inference Spaces — "
|
| 39 |
+
"msradam/riprap-vllm for Granite 4.1 8B FP8 (vLLM), and "
|
| 40 |
+
"msradam/riprap-inference for Prithvi-EO / TerraMind / "
|
| 41 |
+
"Granite TTM / GLiNER / Granite Embedding.",
|
| 42 |
+
),
|
| 43 |
"amd_mi300x": (
|
| 44 |
"AMD MI300X",
|
| 45 |
600.0,
|
| 46 |
"AMD Instinct MI300X data sheet (750 W TDP); ~600 W sustained "
|
| 47 |
+
"during vLLM generation. Selected only when an operator deploys "
|
| 48 |
+
"against an MI300X droplet and sets RIPRAP_HARDWARE_LABEL=AMD "
|
| 49 |
+
"MI300X explicitly. The hackathon submission used to run on "
|
| 50 |
+
"this hardware; the droplet was decommissioned 2026-05-06 and "
|
| 51 |
+
"inference now routes through L4 Spaces.",
|
| 52 |
),
|
| 53 |
"nvidia_t4": (
|
| 54 |
"NVIDIA T4",
|
| 55 |
50.0,
|
| 56 |
"NVIDIA T4 data sheet (70 W max); ~50 W sustained during "
|
| 57 |
+
"transformer inference. Used by the CPU-tier UI Spaces "
|
| 58 |
+
"(lablab + personal mirror) when a small inline LLM runs "
|
| 59 |
+
"alongside the FastAPI front-end.",
|
| 60 |
),
|
| 61 |
"apple_m": (
|
| 62 |
"Apple M-series",
|
| 63 |
20.0,
|
| 64 |
"ml.energy / community measurements: ~20 W package power "
|
| 65 |
+
"during Granite 4.1 q4_K_M inference on Apple M3/M4 (the "
|
| 66 |
+
"local-dev path, no remote backend configured).",
|
| 67 |
),
|
| 68 |
"cpu_server": (
|
| 69 |
"x86 CPU",
|
|
@@ -95,14 +95,23 @@ def _post(path: str, payload: dict[str, Any], timeout: float | None = None) -> d
|
|
| 95 |
raise RemoteUnreachable(f"HTTP {r.status_code} from {path}: {r.text[:200]}")
|
| 96 |
r.raise_for_status()
|
| 97 |
duration_s = time.monotonic() - t0
|
| 98 |
-
#
|
| 99 |
-
#
|
| 100 |
-
#
|
| 101 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
emissions.active().record_ml(
|
| 103 |
endpoint=path,
|
| 104 |
backend="riprap-models",
|
| 105 |
-
hardware=
|
| 106 |
duration_s=duration_s,
|
| 107 |
)
|
| 108 |
return r.json()
|
|
|
|
| 95 |
raise RemoteUnreachable(f"HTTP {r.status_code} from {path}: {r.text[:200]}")
|
| 96 |
r.raise_for_status()
|
| 97 |
duration_s = time.monotonic() - t0
|
| 98 |
+
# Remote ML service is msradam/riprap-inference (or the vLLM-co-
|
| 99 |
+
# hosting msradam/riprap-vllm) — both run on NVIDIA L4 HF Spaces.
|
| 100 |
+
# Operators can override via RIPRAP_HARDWARE_LABEL when targeting
|
| 101 |
+
# different hardware (e.g. an MI300X droplet). Local-fallback paths
|
| 102 |
+
# don't reach this function — they go straight to in-process model
|
| 103 |
+
# loads in the specialist module, which we don't track.
|
| 104 |
+
override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
|
| 105 |
+
if "mi300x" in override or "amd" in override:
|
| 106 |
+
hw = "amd_mi300x"
|
| 107 |
+
elif "t4" in override:
|
| 108 |
+
hw = "nvidia_t4"
|
| 109 |
+
else:
|
| 110 |
+
hw = "nvidia_l4"
|
| 111 |
emissions.active().record_ml(
|
| 112 |
endpoint=path,
|
| 113 |
backend="riprap-models",
|
| 114 |
+
hardware=hw,
|
| 115 |
duration_s=duration_s,
|
| 116 |
)
|
| 117 |
return r.json()
|
|
@@ -237,17 +237,28 @@ def _hardware_for(engine: str) -> str:
|
|
| 237 |
"""Map the active LLM engine to an emissions.HARDWARE key.
|
| 238 |
|
| 239 |
Operator override via RIPRAP_HARDWARE_LABEL is honored where it
|
| 240 |
-
matches a known key (mi300x / t4 / apple / cpu); otherwise
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
|
| 243 |
if "mi300x" in override or "amd" in override:
|
| 244 |
return "amd_mi300x"
|
| 245 |
-
if "
|
|
|
|
|
|
|
| 246 |
return "nvidia_t4"
|
|
|
|
|
|
|
| 247 |
if "apple" in override or "m3" in override or "m4" in override:
|
| 248 |
return "apple_m"
|
| 249 |
-
if
|
| 250 |
-
|
|
|
|
| 251 |
if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
|
| 252 |
return "nvidia_t4"
|
| 253 |
return "apple_m"
|
|
|
|
| 237 |
"""Map the active LLM engine to an emissions.HARDWARE key.
|
| 238 |
|
| 239 |
Operator override via RIPRAP_HARDWARE_LABEL is honored where it
|
| 240 |
+
matches a known key (mi300x / l4 / t4 / apple / cpu); otherwise:
|
| 241 |
+
- Remote vLLM/Ollama (RIPRAP_LLM_BASE_URL set) → NVIDIA L4. Both
|
| 242 |
+
Riprap inference Spaces (msradam/riprap-vllm + msradam/
|
| 243 |
+
riprap-inference) run on L4. The MI300X droplet was retired
|
| 244 |
+
2026-05-06.
|
| 245 |
+
- On a CPU/T4-tier HF Space (UI Space with no remote backend) →
|
| 246 |
+
T4.
|
| 247 |
+
- Otherwise local dev → Apple M-series."""
|
| 248 |
override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
|
| 249 |
if "mi300x" in override or "amd" in override:
|
| 250 |
return "amd_mi300x"
|
| 251 |
+
if "l4" in override:
|
| 252 |
+
return "nvidia_l4"
|
| 253 |
+
if "t4" in override:
|
| 254 |
return "nvidia_t4"
|
| 255 |
+
if "nvidia" in override:
|
| 256 |
+
return "nvidia_l4"
|
| 257 |
if "apple" in override or "m3" in override or "m4" in override:
|
| 258 |
return "apple_m"
|
| 259 |
+
if _VLLM_BASE:
|
| 260 |
+
# Any remote vLLM/Ollama backend currently lives on an L4 Space.
|
| 261 |
+
return "nvidia_l4"
|
| 262 |
if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
|
| 263 |
return "nvidia_t4"
|
| 264 |
return "apple_m"
|