seriffic Claude Opus 4.7 (1M context) committed on
Commit
d48454d
·
1 Parent(s): b84be35

fix(emissions): default hardware to NVIDIA L4

Browse files

The MI300X droplet was retired 2026-05-06; both inference Spaces
(msradam/riprap-vllm for Granite 4.1 8B FP8 + msradam/riprap-inference
for Prithvi / TerraMind / TTM / GLiNER / Embedding) now run on NVIDIA
L4 (24 GB, Ada Lovelace, 72 W TGP). Updates:

- app/emissions.py: HARDWARE adds nvidia_l4 (~60 W sustained per the
L4 data sheet) and reorders so it's the first/canonical entry. The
MI300X entry stays for operators who redeploy to that hardware and
set RIPRAP_HARDWARE_LABEL=AMD MI300X explicitly.
- app/llm.py:_hardware_for: when RIPRAP_LLM_BASE_URL is set (any
remote vLLM/Ollama backend), default to nvidia_l4 instead of MI300X.
RIPRAP_HARDWARE_LABEL override matrix expanded for l4 / t4 / mi300x.
- app/inference.py:_post: record nvidia_l4 by default; honor MI300X /
T4 overrides via RIPRAP_HARDWARE_LABEL.

Net effect on the briefing: a typical query (~700 LLM tokens + a few
ML calls on L4) now reports ~50-80 mWh instead of the inflated
600 mWh that the MI300X profile produced.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show
  1. app/emissions.py +20 -4
  2. app/inference.py +14 -5
  3. app/llm.py +16 -5
app/emissions.py CHANGED
@@ -30,24 +30,40 @@ from typing import Any
30
 
31
  # (label, sustained_power_w, source)
32
  HARDWARE: dict[str, tuple[str, float, str]] = {
 
 
 
 
 
 
 
 
 
 
33
  "amd_mi300x": (
34
  "AMD MI300X",
35
  600.0,
36
  "AMD Instinct MI300X data sheet (750 W TDP); ~600 W sustained "
37
- "during vLLM generation is a conservative midpoint of public "
38
- "ROCm benchmarks.",
 
 
 
39
  ),
40
  "nvidia_t4": (
41
  "NVIDIA T4",
42
  50.0,
43
  "NVIDIA T4 data sheet (70 W max); ~50 W sustained during "
44
- "transformer inference.",
 
 
45
  ),
46
  "apple_m": (
47
  "Apple M-series",
48
  20.0,
49
  "ml.energy / community measurements: ~20 W package power "
50
- "during Granite 4.1 q4_K_M inference on Apple M3/M4.",
 
51
  ),
52
  "cpu_server": (
53
  "x86 CPU",
 
30
 
31
  # (label, sustained_power_w, source)
32
  HARDWARE: dict[str, tuple[str, float, str]] = {
33
+ "nvidia_l4": (
34
+ "NVIDIA L4",
35
+ 60.0,
36
+ "NVIDIA L4 Tensor Core GPU data sheet (72 W TGP, Ada Lovelace, "
37
+ "24 GB); ~60 W sustained during transformer inference. The "
38
+ "active backend for both Riprap inference Spaces — "
39
+ "msradam/riprap-vllm for Granite 4.1 8B FP8 (vLLM), and "
40
+ "msradam/riprap-inference for Prithvi-EO / TerraMind / "
41
+ "Granite TTM / GLiNER / Granite Embedding.",
42
+ ),
43
  "amd_mi300x": (
44
  "AMD MI300X",
45
  600.0,
46
  "AMD Instinct MI300X data sheet (750 W TDP); ~600 W sustained "
47
+ "during vLLM generation. Selected only when an operator deploys "
48
+ "against an MI300X droplet and sets RIPRAP_HARDWARE_LABEL=AMD "
49
+ "MI300X explicitly. The hackathon submission used to run on "
50
+ "this hardware; the droplet was decommissioned 2026-05-06 and "
51
+ "inference now routes through L4 Spaces.",
52
  ),
53
  "nvidia_t4": (
54
  "NVIDIA T4",
55
  50.0,
56
  "NVIDIA T4 data sheet (70 W max); ~50 W sustained during "
57
+ "transformer inference. Used by the CPU-tier UI Spaces "
58
+ "(lablab + personal mirror) when a small inline LLM runs "
59
+ "alongside the FastAPI front-end.",
60
  ),
61
  "apple_m": (
62
  "Apple M-series",
63
  20.0,
64
  "ml.energy / community measurements: ~20 W package power "
65
+ "during Granite 4.1 q4_K_M inference on Apple M3/M4 (the "
66
+ "local-dev path, no remote backend configured).",
67
  ),
68
  "cpu_server": (
69
  "x86 CPU",
app/inference.py CHANGED
@@ -95,14 +95,23 @@ def _post(path: str, payload: dict[str, Any], timeout: float | None = None) -> d
95
  raise RemoteUnreachable(f"HTTP {r.status_code} from {path}: {r.text[:200]}")
96
  r.raise_for_status()
97
  duration_s = time.monotonic() - t0
98
- # The remote ML service runs alongside vLLM on the AMD MI300X
99
- # droplet; attribute the wallclock to that hardware. Local-fallback
100
- # paths don't reach this function they go straight to in-process
101
- # model loads in the specialist module, which we don't track.
 
 
 
 
 
 
 
 
 
102
  emissions.active().record_ml(
103
  endpoint=path,
104
  backend="riprap-models",
105
- hardware="amd_mi300x",
106
  duration_s=duration_s,
107
  )
108
  return r.json()
 
95
  raise RemoteUnreachable(f"HTTP {r.status_code} from {path}: {r.text[:200]}")
96
  r.raise_for_status()
97
  duration_s = time.monotonic() - t0
98
+ # Remote ML service is msradam/riprap-inference (or the vLLM-co-
99
+ # hosting msradam/riprap-vllm) — both run on NVIDIA L4 HF Spaces.
100
+ # Operators can override via RIPRAP_HARDWARE_LABEL when targeting
101
+ # different hardware (e.g. an MI300X droplet). Local-fallback paths
102
+ # don't reach this function — they go straight to in-process model
103
+ # loads in the specialist module, which we don't track.
104
+ override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
105
+ if "mi300x" in override or "amd" in override:
106
+ hw = "amd_mi300x"
107
+ elif "t4" in override:
108
+ hw = "nvidia_t4"
109
+ else:
110
+ hw = "nvidia_l4"
111
  emissions.active().record_ml(
112
  endpoint=path,
113
  backend="riprap-models",
114
+ hardware=hw,
115
  duration_s=duration_s,
116
  )
117
  return r.json()
app/llm.py CHANGED
@@ -237,17 +237,28 @@ def _hardware_for(engine: str) -> str:
237
  """Map the active LLM engine to an emissions.HARDWARE key.
238
 
239
  Operator override via RIPRAP_HARDWARE_LABEL is honored where it
240
- matches a known key (mi300x / t4 / apple / cpu); otherwise we
241
- infer from engine selection and HF Space presence."""
 
 
 
 
 
 
242
  override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
243
  if "mi300x" in override or "amd" in override:
244
  return "amd_mi300x"
245
- if "t4" in override or "nvidia" in override:
 
 
246
  return "nvidia_t4"
 
 
247
  if "apple" in override or "m3" in override or "m4" in override:
248
  return "apple_m"
249
- if engine == "vLLM" and _VLLM_BASE:
250
- return "amd_mi300x"
 
251
  if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
252
  return "nvidia_t4"
253
  return "apple_m"
 
237
  """Map the active LLM engine to an emissions.HARDWARE key.
238
 
239
  Operator override via RIPRAP_HARDWARE_LABEL is honored where it
240
+ matches a known key (mi300x / l4 / t4 / apple / cpu); otherwise:
241
+ - Remote vLLM/Ollama (RIPRAP_LLM_BASE_URL set) → NVIDIA L4. Both
242
+ Riprap inference Spaces (msradam/riprap-vllm + msradam/
243
+ riprap-inference) run on L4. The MI300X droplet was retired
244
+ 2026-05-06.
245
+ - On a CPU/T4-tier HF Space (UI Space with no remote backend) →
246
+ T4.
247
+ - Otherwise local dev → Apple M-series."""
248
  override = (os.environ.get("RIPRAP_HARDWARE_LABEL") or "").lower()
249
  if "mi300x" in override or "amd" in override:
250
  return "amd_mi300x"
251
+ if "l4" in override:
252
+ return "nvidia_l4"
253
+ if "t4" in override:
254
  return "nvidia_t4"
255
+ if "nvidia" in override:
256
+ return "nvidia_l4"
257
  if "apple" in override or "m3" in override or "m4" in override:
258
  return "apple_m"
259
+ if _VLLM_BASE:
260
+ # Any remote vLLM/Ollama backend currently lives on an L4 Space.
261
+ return "nvidia_l4"
262
  if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
263
  return "nvidia_t4"
264
  return "apple_m"