feat: AMD Live Inference Dashboard — show latency, throughput, token count after each analysis
- model_loader.generate_response() now measures wall-clock latency with time.perf_counter()
and extracts completion_tokens/total_tokens from the OpenAI usage object
- returns (text, metrics) tuple; metrics = {latency_ms, total_tokens, tokens_per_sec}
- agent.analyze_image_and_text() passes metrics through as result['_metrics']
- app.py: new _metrics_bar() renders 3 chips (Latency / Throughput / Tokens) above diagnosis card
- metrics labels are fully i18n'd across all 6 languages
- app.py +57 -1
- src/agent.py +7 -2
- src/model_loader.py +23 -3
app.py
CHANGED
|
@@ -49,6 +49,9 @@ _I18N = {
|
|
| 49 |
"error_title": "Backend Unavailable",
|
| 50 |
"error_body": "AMD Cloud backend is unreachable. Please try again later.",
|
| 51 |
"examples_label": "Quick Examples",
|
|
|
|
|
|
|
|
|
|
| 52 |
},
|
| 53 |
"vn": {
|
| 54 |
"img_label": "Tải lên hình ảnh y tế",
|
|
@@ -68,6 +71,9 @@ _I18N = {
|
|
| 68 |
"error_title": "Hệ thống không khả dụng",
|
| 69 |
"error_body": "Không thể kết nối AMD Cloud. Vui lòng thử lại sau.",
|
| 70 |
"examples_label": "Ví dụ nhanh",
|
|
|
|
|
|
|
|
|
|
| 71 |
},
|
| 72 |
"zh": {
|
| 73 |
"img_label": "上传医学图像",
|
|
@@ -87,6 +93,9 @@ _I18N = {
|
|
| 87 |
"error_title": "后端不可用",
|
| 88 |
"error_body": "AMD Cloud 后端无法访问,请稍后重试。",
|
| 89 |
"examples_label": "快速示例",
|
|
|
|
|
|
|
|
|
|
| 90 |
},
|
| 91 |
"es": {
|
| 92 |
"img_label": "Subir imagen médica",
|
|
@@ -106,6 +115,9 @@ _I18N = {
|
|
| 106 |
"error_title": "Backend no disponible",
|
| 107 |
"error_body": "El backend de AMD Cloud no está disponible. Por favor, inténtelo más tarde.",
|
| 108 |
"examples_label": "Ejemplos rápidos",
|
|
|
|
|
|
|
|
|
|
| 109 |
},
|
| 110 |
"fr": {
|
| 111 |
"img_label": "Télécharger une image médicale",
|
|
@@ -125,6 +137,9 @@ _I18N = {
|
|
| 125 |
"error_title": "Backend indisponible",
|
| 126 |
"error_body": "Le backend AMD Cloud est inaccessible. Veuillez réessayer plus tard.",
|
| 127 |
"examples_label": "Exemples rapides",
|
|
|
|
|
|
|
|
|
|
| 128 |
},
|
| 129 |
"ja": {
|
| 130 |
"img_label": "医療画像をアップロード",
|
|
@@ -144,6 +159,9 @@ _I18N = {
|
|
| 144 |
"error_title": "バックエンド利用不可",
|
| 145 |
"error_body": "AMD Cloudバックエンドに接続できません。後でもう一度お試しください。",
|
| 146 |
"examples_label": "クイック例",
|
|
|
|
|
|
|
|
|
|
| 147 |
},
|
| 148 |
}
|
| 149 |
|
|
@@ -208,6 +226,41 @@ def _severity_badge(severity: str) -> str:
|
|
| 208 |
)
|
| 209 |
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def _confidence_bar(score: int, label: str) -> str:
|
| 212 |
if score == 0:
|
| 213 |
return ""
|
|
@@ -240,6 +293,7 @@ def _build_result_html(result: dict, lang: str) -> str:
|
|
| 240 |
sev = _SEVERITY_TRANSLATE.get(lang, _SEVERITY_TRANSLATE["en"]).get(sev_en, sev_en)
|
| 241 |
actions = result.get("recommended_actions", [])
|
| 242 |
score = result.get("confidence_score", 0)
|
|
|
|
| 243 |
|
| 244 |
actions_html = "".join(
|
| 245 |
f"<li style='margin:5px 0; color:#d1d5db;'>{a}</li>" for a in actions
|
|
@@ -255,7 +309,7 @@ def _build_result_html(result: dict, lang: str) -> str:
|
|
| 255 |
<div style='background:#111827; border:1px solid #ED1C24; border-radius:12px;
|
| 256 |
padding:20px; font-family:Arial,sans-serif; color:#f9fafb;'>
|
| 257 |
|
| 258 |
-
<div style='display:flex; align-items:center; gap:10px; margin-bottom:
|
| 259 |
<div style='background:#ED1C24; width:4px; border-radius:2px; height:36px;'></div>
|
| 260 |
<div>
|
| 261 |
<div style='font-size:1.1rem; font-weight:700; color:#ED1C24;'>
|
|
@@ -265,6 +319,8 @@ def _build_result_html(result: dict, lang: str) -> str:
|
|
| 265 |
</div>
|
| 266 |
</div>
|
| 267 |
|
|
|
|
|
|
|
| 268 |
<div style='background:#1f2937; border-radius:8px; padding:14px; margin-bottom:12px;'>
|
| 269 |
<div style='font-size:0.75rem; text-transform:uppercase; letter-spacing:.05em;
|
| 270 |
color:#9ca3af; margin-bottom:6px;'>{t['diag_label']}</div>
|
|
|
|
| 49 |
"error_title": "Backend Unavailable",
|
| 50 |
"error_body": "AMD Cloud backend is unreachable. Please try again later.",
|
| 51 |
"examples_label": "Quick Examples",
|
| 52 |
+
"metrics_latency": "Latency",
|
| 53 |
+
"metrics_throughput": "Throughput",
|
| 54 |
+
"metrics_tokens": "tokens",
|
| 55 |
},
|
| 56 |
"vn": {
|
| 57 |
"img_label": "Tải lên hình ảnh y tế",
|
|
|
|
| 71 |
"error_title": "Hệ thống không khả dụng",
|
| 72 |
"error_body": "Không thể kết nối AMD Cloud. Vui lòng thử lại sau.",
|
| 73 |
"examples_label": "Ví dụ nhanh",
|
| 74 |
+
"metrics_latency": "Độ trễ",
|
| 75 |
+
"metrics_throughput": "Thông lượng",
|
| 76 |
+
"metrics_tokens": "token",
|
| 77 |
},
|
| 78 |
"zh": {
|
| 79 |
"img_label": "上传医学图像",
|
|
|
|
| 93 |
"error_title": "后端不可用",
|
| 94 |
"error_body": "AMD Cloud 后端无法访问,请稍后重试。",
|
| 95 |
"examples_label": "快速示例",
|
| 96 |
+
"metrics_latency": "延迟",
|
| 97 |
+
"metrics_throughput": "吞吐量",
|
| 98 |
+
"metrics_tokens": "tokens",
|
| 99 |
},
|
| 100 |
"es": {
|
| 101 |
"img_label": "Subir imagen médica",
|
|
|
|
| 115 |
"error_title": "Backend no disponible",
|
| 116 |
"error_body": "El backend de AMD Cloud no está disponible. Por favor, inténtelo más tarde.",
|
| 117 |
"examples_label": "Ejemplos rápidos",
|
| 118 |
+
"metrics_latency": "Latencia",
|
| 119 |
+
"metrics_throughput": "Rendimiento",
|
| 120 |
+
"metrics_tokens": "tokens",
|
| 121 |
},
|
| 122 |
"fr": {
|
| 123 |
"img_label": "Télécharger une image médicale",
|
|
|
|
| 137 |
"error_title": "Backend indisponible",
|
| 138 |
"error_body": "Le backend AMD Cloud est inaccessible. Veuillez réessayer plus tard.",
|
| 139 |
"examples_label": "Exemples rapides",
|
| 140 |
+
"metrics_latency": "Latence",
|
| 141 |
+
"metrics_throughput": "Débit",
|
| 142 |
+
"metrics_tokens": "tokens",
|
| 143 |
},
|
| 144 |
"ja": {
|
| 145 |
"img_label": "医療画像をアップロード",
|
|
|
|
| 159 |
"error_title": "バックエンド利用不可",
|
| 160 |
"error_body": "AMD Cloudバックエンドに接続できません。後でもう一度お試しください。",
|
| 161 |
"examples_label": "クイック例",
|
| 162 |
+
"metrics_latency": "レイテンシ",
|
| 163 |
+
"metrics_throughput": "スループット",
|
| 164 |
+
"metrics_tokens": "トークン",
|
| 165 |
},
|
| 166 |
}
|
| 167 |
|
|
|
|
| 226 |
)
|
| 227 |
|
| 228 |
|
| 229 |
+
def _metrics_bar(metrics: dict, t: dict) -> str:
|
| 230 |
+
latency_ms = metrics.get("latency_ms", 0)
|
| 231 |
+
tok_per_sec = metrics.get("tokens_per_sec", 0)
|
| 232 |
+
total_tokens = metrics.get("total_tokens", 0)
|
| 233 |
+
|
| 234 |
+
def _chip(label: str, value: str) -> str:
|
| 235 |
+
return (
|
| 236 |
+
f"<span style='display:inline-flex; flex-direction:column; align-items:center; "
|
| 237 |
+
f"background:#0f172a; border:1px solid #374151; border-radius:8px; "
|
| 238 |
+
f"padding:6px 14px; min-width:80px;'>"
|
| 239 |
+
f"<span style='font-size:1rem; font-weight:700; color:#ED1C24;'>{value}</span>"
|
| 240 |
+
f"<span style='font-size:0.65rem; color:#6b7280; text-transform:uppercase; "
|
| 241 |
+
f"letter-spacing:.05em; margin-top:2px;'>{label}</span>"
|
| 242 |
+
f"</span>"
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
latency_val = f"{latency_ms:,} ms" if latency_ms else "—"
|
| 246 |
+
throughput_val = f"{tok_per_sec} {t['metrics_tokens']}/s" if tok_per_sec else "—"
|
| 247 |
+
tokens_val = f"{total_tokens:,} {t['metrics_tokens']}" if total_tokens else "—"
|
| 248 |
+
|
| 249 |
+
return (
|
| 250 |
+
f"<div style='display:flex; gap:8px; flex-wrap:wrap; align-items:center; "
|
| 251 |
+
f"justify-content:space-between; background:#1f2937; border-radius:8px; "
|
| 252 |
+
f"padding:10px 14px; margin-bottom:12px;'>"
|
| 253 |
+
f"<div style='font-size:0.68rem; color:#6b7280; font-family:monospace; "
|
| 254 |
+
f"white-space:nowrap;'>⚡ AMD MI300X · ROCm · vLLM</div>"
|
| 255 |
+
f"<div style='display:flex; gap:8px; flex-wrap:wrap;'>"
|
| 256 |
+
f"{_chip(t['metrics_latency'], latency_val)}"
|
| 257 |
+
f"{_chip(t['metrics_throughput'], throughput_val)}"
|
| 258 |
+
f"{_chip(t['metrics_tokens'], tokens_val)}"
|
| 259 |
+
f"</div>"
|
| 260 |
+
f"</div>"
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
def _confidence_bar(score: int, label: str) -> str:
|
| 265 |
if score == 0:
|
| 266 |
return ""
|
|
|
|
| 293 |
sev = _SEVERITY_TRANSLATE.get(lang, _SEVERITY_TRANSLATE["en"]).get(sev_en, sev_en)
|
| 294 |
actions = result.get("recommended_actions", [])
|
| 295 |
score = result.get("confidence_score", 0)
|
| 296 |
+
metrics = result.get("_metrics", {})
|
| 297 |
|
| 298 |
actions_html = "".join(
|
| 299 |
f"<li style='margin:5px 0; color:#d1d5db;'>{a}</li>" for a in actions
|
|
|
|
| 309 |
<div style='background:#111827; border:1px solid #ED1C24; border-radius:12px;
|
| 310 |
padding:20px; font-family:Arial,sans-serif; color:#f9fafb;'>
|
| 311 |
|
| 312 |
+
<div style='display:flex; align-items:center; gap:10px; margin-bottom:12px;'>
|
| 313 |
<div style='background:#ED1C24; width:4px; border-radius:2px; height:36px;'></div>
|
| 314 |
<div>
|
| 315 |
<div style='font-size:1.1rem; font-weight:700; color:#ED1C24;'>
|
|
|
|
| 319 |
</div>
|
| 320 |
</div>
|
| 321 |
|
| 322 |
+
{_metrics_bar(metrics, t)}
|
| 323 |
+
|
| 324 |
<div style='background:#1f2937; border-radius:8px; padding:14px; margin-bottom:12px;'>
|
| 325 |
<div style='font-size:0.75rem; text-transform:uppercase; letter-spacing:.05em;
|
| 326 |
color:#9ca3af; margin-bottom:6px;'>{t['diag_label']}</div>
|
src/agent.py
CHANGED
|
@@ -63,8 +63,13 @@ def analyze_image_and_text(
|
|
| 63 |
Run analysis via AMD Cloud backend.
|
| 64 |
Raises RuntimeError if the backend is unreachable.
|
| 65 |
Raises ValueError if the model response cannot be parsed.
|
|
|
|
|
|
|
|
|
|
| 66 |
"""
|
| 67 |
lang = language.lower()
|
| 68 |
prompt = _build_prompt(image_path, text_description, lang)
|
| 69 |
-
raw = generate_response(prompt, image_path=image_path)
|
| 70 |
-
|
|
|
|
|
|
|
|
|
| 63 |
Run analysis via AMD Cloud backend.
|
| 64 |
Raises RuntimeError if the backend is unreachable.
|
| 65 |
Raises ValueError if the model response cannot be parsed.
|
| 66 |
+
|
| 67 |
+
Returns dict with keys: diagnosis, severity, recommended_actions,
|
| 68 |
+
confidence_score, _metrics (latency_ms, total_tokens, tokens_per_sec).
|
| 69 |
"""
|
| 70 |
lang = language.lower()
|
| 71 |
prompt = _build_prompt(image_path, text_description, lang)
|
| 72 |
+
raw, metrics = generate_response(prompt, image_path=image_path)
|
| 73 |
+
result = _parse_response(raw)
|
| 74 |
+
result["_metrics"] = metrics
|
| 75 |
+
return result
|
src/model_loader.py
CHANGED
|
@@ -5,6 +5,7 @@ OpenAI-compatible API. No local model weights are loaded here.
|
|
| 5 |
import base64
|
| 6 |
import mimetypes
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
import src.config as config
|
| 10 |
|
|
@@ -61,9 +62,15 @@ def check_connection() -> tuple[bool, str]:
|
|
| 61 |
return False, f"{type(exc).__name__}: {exc}"
|
| 62 |
|
| 63 |
|
| 64 |
-
def generate_response(prompt: str, image_path: str = None) -> str:
|
| 65 |
"""
|
| 66 |
-
Send a request to the vLLM endpoint and return the text output.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
Raises RuntimeError if the backend is unreachable or returns an error.
|
| 68 |
"""
|
| 69 |
try:
|
|
@@ -86,13 +93,26 @@ def generate_response(prompt: str, image_path: str = None) -> str:
|
|
| 86 |
else:
|
| 87 |
messages = [{"role": "user", "content": prompt}]
|
| 88 |
|
|
|
|
| 89 |
response = client.chat.completions.create(
|
| 90 |
model=config.MODEL_NAME,
|
| 91 |
messages=messages,
|
| 92 |
max_tokens=config.MAX_NEW_TOKENS,
|
| 93 |
temperature=config.TEMPERATURE,
|
| 94 |
)
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
except Exception as exc:
|
| 98 |
raise RuntimeError(f"AMD Cloud backend unreachable: {exc}") from exc
|
|
|
|
| 5 |
import base64
|
| 6 |
import mimetypes
|
| 7 |
import os
|
| 8 |
+
import time
|
| 9 |
|
| 10 |
import src.config as config
|
| 11 |
|
|
|
|
| 62 |
return False, f"{type(exc).__name__}: {exc}"
|
| 63 |
|
| 64 |
|
| 65 |
+
def generate_response(prompt: str, image_path: str = None) -> tuple[str, dict]:
|
| 66 |
"""
|
| 67 |
+
Send a request to the vLLM endpoint and return (text_output, metrics).
|
| 68 |
+
|
| 69 |
+
metrics keys:
|
| 70 |
+
latency_ms – wall-clock time for the API call in milliseconds
|
| 71 |
+
total_tokens – total tokens used (prompt + completion), or 0 if unavailable
|
| 72 |
+
tokens_per_sec – completion tokens / latency, or 0 if unavailable
|
| 73 |
+
|
| 74 |
Raises RuntimeError if the backend is unreachable or returns an error.
|
| 75 |
"""
|
| 76 |
try:
|
|
|
|
| 93 |
else:
|
| 94 |
messages = [{"role": "user", "content": prompt}]
|
| 95 |
|
| 96 |
+
t0 = time.perf_counter()
|
| 97 |
response = client.chat.completions.create(
|
| 98 |
model=config.MODEL_NAME,
|
| 99 |
messages=messages,
|
| 100 |
max_tokens=config.MAX_NEW_TOKENS,
|
| 101 |
temperature=config.TEMPERATURE,
|
| 102 |
)
|
| 103 |
+
latency_ms = (time.perf_counter() - t0) * 1000
|
| 104 |
+
|
| 105 |
+
usage = getattr(response, "usage", None)
|
| 106 |
+
completion_tokens = getattr(usage, "completion_tokens", 0) or 0
|
| 107 |
+
total_tokens = getattr(usage, "total_tokens", 0) or 0
|
| 108 |
+
tokens_per_sec = (completion_tokens / (latency_ms / 1000)) if latency_ms > 0 and completion_tokens > 0 else 0
|
| 109 |
+
|
| 110 |
+
metrics = {
|
| 111 |
+
"latency_ms": round(latency_ms),
|
| 112 |
+
"total_tokens": total_tokens,
|
| 113 |
+
"tokens_per_sec": round(tokens_per_sec, 1),
|
| 114 |
+
}
|
| 115 |
+
return response.choices[0].message.content, metrics
|
| 116 |
|
| 117 |
except Exception as exc:
|
| 118 |
raise RuntimeError(f"AMD Cloud backend unreachable: {exc}") from exc
|