dikheng committed on
Commit
c19c1f8
·
1 Parent(s): ac0cc43

feat: AMD Live Inference Dashboard — show latency, throughput, token count after each analysis

Browse files

- model_loader.generate_response() now measures wall-clock latency with time.perf_counter()
and extracts completion_tokens/total_tokens from the OpenAI usage object
- returns (text, metrics) tuple; metrics = {latency_ms, total_tokens, tokens_per_sec}
- agent.analyze_image_and_text() passes metrics through as result['_metrics']
- app.py: new _metrics_bar() renders 3 chips (Latency / Throughput / Tokens) above diagnosis card
- metrics labels are localized (i18n) in all 6 supported languages

Files changed (3) hide show
  1. app.py +57 -1
  2. src/agent.py +7 -2
  3. src/model_loader.py +23 -3
app.py CHANGED
@@ -49,6 +49,9 @@ _I18N = {
49
  "error_title": "Backend Unavailable",
50
  "error_body": "AMD Cloud backend is unreachable. Please try again later.",
51
  "examples_label": "Quick Examples",
 
 
 
52
  },
53
  "vn": {
54
  "img_label": "Tải lên hình ảnh y tế",
@@ -68,6 +71,9 @@ _I18N = {
68
  "error_title": "Hệ thống không khả dụng",
69
  "error_body": "Không thể kết nối AMD Cloud. Vui lòng thử lại sau.",
70
  "examples_label": "Ví dụ nhanh",
 
 
 
71
  },
72
  "zh": {
73
  "img_label": "上传医学图像",
@@ -87,6 +93,9 @@ _I18N = {
87
  "error_title": "后端不可用",
88
  "error_body": "AMD Cloud 后端无法访问,请稍后重试。",
89
  "examples_label": "快速示例",
 
 
 
90
  },
91
  "es": {
92
  "img_label": "Subir imagen médica",
@@ -106,6 +115,9 @@ _I18N = {
106
  "error_title": "Backend no disponible",
107
  "error_body": "El backend de AMD Cloud no está disponible. Por favor, inténtelo más tarde.",
108
  "examples_label": "Ejemplos rápidos",
 
 
 
109
  },
110
  "fr": {
111
  "img_label": "Télécharger une image médicale",
@@ -125,6 +137,9 @@ _I18N = {
125
  "error_title": "Backend indisponible",
126
  "error_body": "Le backend AMD Cloud est inaccessible. Veuillez réessayer plus tard.",
127
  "examples_label": "Exemples rapides",
 
 
 
128
  },
129
  "ja": {
130
  "img_label": "医療画像をアップロード",
@@ -144,6 +159,9 @@ _I18N = {
144
  "error_title": "バックエンド利用不可",
145
  "error_body": "AMD Cloudバックエンドに接続できません。後でもう一度お試しください。",
146
  "examples_label": "クイック例",
 
 
 
147
  },
148
  }
149
 
@@ -208,6 +226,41 @@ def _severity_badge(severity: str) -> str:
208
  )
209
 
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  def _confidence_bar(score: int, label: str) -> str:
212
  if score == 0:
213
  return ""
@@ -240,6 +293,7 @@ def _build_result_html(result: dict, lang: str) -> str:
240
  sev = _SEVERITY_TRANSLATE.get(lang, _SEVERITY_TRANSLATE["en"]).get(sev_en, sev_en)
241
  actions = result.get("recommended_actions", [])
242
  score = result.get("confidence_score", 0)
 
243
 
244
  actions_html = "".join(
245
  f"<li style='margin:5px 0; color:#d1d5db;'>{a}</li>" for a in actions
@@ -255,7 +309,7 @@ def _build_result_html(result: dict, lang: str) -> str:
255
  <div style='background:#111827; border:1px solid #ED1C24; border-radius:12px;
256
  padding:20px; font-family:Arial,sans-serif; color:#f9fafb;'>
257
 
258
- <div style='display:flex; align-items:center; gap:10px; margin-bottom:16px;'>
259
  <div style='background:#ED1C24; width:4px; border-radius:2px; height:36px;'></div>
260
  <div>
261
  <div style='font-size:1.1rem; font-weight:700; color:#ED1C24;'>
@@ -265,6 +319,8 @@ def _build_result_html(result: dict, lang: str) -> str:
265
  </div>
266
  </div>
267
 
 
 
268
  <div style='background:#1f2937; border-radius:8px; padding:14px; margin-bottom:12px;'>
269
  <div style='font-size:0.75rem; text-transform:uppercase; letter-spacing:.05em;
270
  color:#9ca3af; margin-bottom:6px;'>{t['diag_label']}</div>
 
49
  "error_title": "Backend Unavailable",
50
  "error_body": "AMD Cloud backend is unreachable. Please try again later.",
51
  "examples_label": "Quick Examples",
52
+ "metrics_latency": "Latency",
53
+ "metrics_throughput": "Throughput",
54
+ "metrics_tokens": "tokens",
55
  },
56
  "vn": {
57
  "img_label": "Tải lên hình ảnh y tế",
 
71
  "error_title": "Hệ thống không khả dụng",
72
  "error_body": "Không thể kết nối AMD Cloud. Vui lòng thử lại sau.",
73
  "examples_label": "Ví dụ nhanh",
74
+ "metrics_latency": "Độ trễ",
75
+ "metrics_throughput": "Thông lượng",
76
+ "metrics_tokens": "token",
77
  },
78
  "zh": {
79
  "img_label": "上传医学图像",
 
93
  "error_title": "后端不可用",
94
  "error_body": "AMD Cloud 后端无法访问,请稍后重试。",
95
  "examples_label": "快速示例",
96
+ "metrics_latency": "延迟",
97
+ "metrics_throughput": "吞吐量",
98
+ "metrics_tokens": "tokens",
99
  },
100
  "es": {
101
  "img_label": "Subir imagen médica",
 
115
  "error_title": "Backend no disponible",
116
  "error_body": "El backend de AMD Cloud no está disponible. Por favor, inténtelo más tarde.",
117
  "examples_label": "Ejemplos rápidos",
118
+ "metrics_latency": "Latencia",
119
+ "metrics_throughput": "Rendimiento",
120
+ "metrics_tokens": "tokens",
121
  },
122
  "fr": {
123
  "img_label": "Télécharger une image médicale",
 
137
  "error_title": "Backend indisponible",
138
  "error_body": "Le backend AMD Cloud est inaccessible. Veuillez réessayer plus tard.",
139
  "examples_label": "Exemples rapides",
140
+ "metrics_latency": "Latence",
141
+ "metrics_throughput": "Débit",
142
+ "metrics_tokens": "tokens",
143
  },
144
  "ja": {
145
  "img_label": "医療画像をアップロード",
 
159
  "error_title": "バックエンド利用不可",
160
  "error_body": "AMD Cloudバックエンドに接続できません。後でもう一度お試しください。",
161
  "examples_label": "クイック例",
162
+ "metrics_latency": "レイテンシ",
163
+ "metrics_throughput": "スループット",
164
+ "metrics_tokens": "トークン",
165
  },
166
  }
167
 
 
226
  )
227
 
228
 
229
+ def _metrics_bar(metrics: dict, t: dict) -> str:
230
+ latency_ms = metrics.get("latency_ms", 0)
231
+ tok_per_sec = metrics.get("tokens_per_sec", 0)
232
+ total_tokens = metrics.get("total_tokens", 0)
233
+
234
+ def _chip(label: str, value: str) -> str:
235
+ return (
236
+ f"<span style='display:inline-flex; flex-direction:column; align-items:center; "
237
+ f"background:#0f172a; border:1px solid #374151; border-radius:8px; "
238
+ f"padding:6px 14px; min-width:80px;'>"
239
+ f"<span style='font-size:1rem; font-weight:700; color:#ED1C24;'>{value}</span>"
240
+ f"<span style='font-size:0.65rem; color:#6b7280; text-transform:uppercase; "
241
+ f"letter-spacing:.05em; margin-top:2px;'>{label}</span>"
242
+ f"</span>"
243
+ )
244
+
245
+ latency_val = f"{latency_ms:,} ms" if latency_ms else "—"
246
+ throughput_val = f"{tok_per_sec} {t['metrics_tokens']}/s" if tok_per_sec else "—"
247
+ tokens_val = f"{total_tokens:,} {t['metrics_tokens']}" if total_tokens else "—"
248
+
249
+ return (
250
+ f"<div style='display:flex; gap:8px; flex-wrap:wrap; align-items:center; "
251
+ f"justify-content:space-between; background:#1f2937; border-radius:8px; "
252
+ f"padding:10px 14px; margin-bottom:12px;'>"
253
+ f"<div style='font-size:0.68rem; color:#6b7280; font-family:monospace; "
254
+ f"white-space:nowrap;'>⚡ AMD MI300X · ROCm · vLLM</div>"
255
+ f"<div style='display:flex; gap:8px; flex-wrap:wrap;'>"
256
+ f"{_chip(t['metrics_latency'], latency_val)}"
257
+ f"{_chip(t['metrics_throughput'], throughput_val)}"
258
+ f"{_chip(t['metrics_tokens'], tokens_val)}"
259
+ f"</div>"
260
+ f"</div>"
261
+ )
262
+
263
+
264
  def _confidence_bar(score: int, label: str) -> str:
265
  if score == 0:
266
  return ""
 
293
  sev = _SEVERITY_TRANSLATE.get(lang, _SEVERITY_TRANSLATE["en"]).get(sev_en, sev_en)
294
  actions = result.get("recommended_actions", [])
295
  score = result.get("confidence_score", 0)
296
+ metrics = result.get("_metrics", {})
297
 
298
  actions_html = "".join(
299
  f"<li style='margin:5px 0; color:#d1d5db;'>{a}</li>" for a in actions
 
309
  <div style='background:#111827; border:1px solid #ED1C24; border-radius:12px;
310
  padding:20px; font-family:Arial,sans-serif; color:#f9fafb;'>
311
 
312
+ <div style='display:flex; align-items:center; gap:10px; margin-bottom:12px;'>
313
  <div style='background:#ED1C24; width:4px; border-radius:2px; height:36px;'></div>
314
  <div>
315
  <div style='font-size:1.1rem; font-weight:700; color:#ED1C24;'>
 
319
  </div>
320
  </div>
321
 
322
+ {_metrics_bar(metrics, t)}
323
+
324
  <div style='background:#1f2937; border-radius:8px; padding:14px; margin-bottom:12px;'>
325
  <div style='font-size:0.75rem; text-transform:uppercase; letter-spacing:.05em;
326
  color:#9ca3af; margin-bottom:6px;'>{t['diag_label']}</div>
src/agent.py CHANGED
@@ -63,8 +63,13 @@ def analyze_image_and_text(
63
  Run analysis via AMD Cloud backend.
64
  Raises RuntimeError if the backend is unreachable.
65
  Raises ValueError if the model response cannot be parsed.
 
 
 
66
  """
67
  lang = language.lower()
68
  prompt = _build_prompt(image_path, text_description, lang)
69
- raw = generate_response(prompt, image_path=image_path)
70
- return _parse_response(raw)
 
 
 
63
  Run analysis via AMD Cloud backend.
64
  Raises RuntimeError if the backend is unreachable.
65
  Raises ValueError if the model response cannot be parsed.
66
+
67
+ Returns dict with keys: diagnosis, severity, recommended_actions,
68
+ confidence_score, _metrics (latency_ms, total_tokens, tokens_per_sec).
69
  """
70
  lang = language.lower()
71
  prompt = _build_prompt(image_path, text_description, lang)
72
+ raw, metrics = generate_response(prompt, image_path=image_path)
73
+ result = _parse_response(raw)
74
+ result["_metrics"] = metrics
75
+ return result
src/model_loader.py CHANGED
@@ -5,6 +5,7 @@ OpenAI-compatible API. No local model weights are loaded here.
5
  import base64
6
  import mimetypes
7
  import os
 
8
 
9
  import src.config as config
10
 
@@ -61,9 +62,15 @@ def check_connection() -> tuple[bool, str]:
61
  return False, f"{type(exc).__name__}: {exc}"
62
 
63
 
64
- def generate_response(prompt: str, image_path: str = None) -> str:
65
  """
66
- Send a request to the vLLM endpoint and return the model's text output.
 
 
 
 
 
 
67
  Raises RuntimeError if the backend is unreachable or returns an error.
68
  """
69
  try:
@@ -86,13 +93,26 @@ def generate_response(prompt: str, image_path: str = None) -> str:
86
  else:
87
  messages = [{"role": "user", "content": prompt}]
88
 
 
89
  response = client.chat.completions.create(
90
  model=config.MODEL_NAME,
91
  messages=messages,
92
  max_tokens=config.MAX_NEW_TOKENS,
93
  temperature=config.TEMPERATURE,
94
  )
95
- return response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  except Exception as exc:
98
  raise RuntimeError(f"AMD Cloud backend unreachable: {exc}") from exc
 
5
  import base64
6
  import mimetypes
7
  import os
8
+ import time
9
 
10
  import src.config as config
11
 
 
62
  return False, f"{type(exc).__name__}: {exc}"
63
 
64
 
65
+ def generate_response(prompt: str, image_path: str = None) -> tuple[str, dict]:
66
  """
67
+ Send a request to the vLLM endpoint and return (text_output, metrics).
68
+
69
+ metrics keys:
70
+ latency_ms – wall-clock time for the API call in milliseconds
71
+ total_tokens – total tokens used (prompt + completion), or 0 if unavailable
72
+ tokens_per_sec – completion tokens / latency, or 0 if unavailable
73
+
74
  Raises RuntimeError if the backend is unreachable or returns an error.
75
  """
76
  try:
 
93
  else:
94
  messages = [{"role": "user", "content": prompt}]
95
 
96
+ t0 = time.perf_counter()
97
  response = client.chat.completions.create(
98
  model=config.MODEL_NAME,
99
  messages=messages,
100
  max_tokens=config.MAX_NEW_TOKENS,
101
  temperature=config.TEMPERATURE,
102
  )
103
+ latency_ms = (time.perf_counter() - t0) * 1000
104
+
105
+ usage = getattr(response, "usage", None)
106
+ completion_tokens = getattr(usage, "completion_tokens", 0) or 0
107
+ total_tokens = getattr(usage, "total_tokens", 0) or 0
108
+ tokens_per_sec = (completion_tokens / (latency_ms / 1000)) if latency_ms > 0 and completion_tokens > 0 else 0
109
+
110
+ metrics = {
111
+ "latency_ms": round(latency_ms),
112
+ "total_tokens": total_tokens,
113
+ "tokens_per_sec": round(tokens_per_sec, 1),
114
+ }
115
+ return response.choices[0].message.content, metrics
116
 
117
  except Exception as exc:
118
  raise RuntimeError(f"AMD Cloud backend unreachable: {exc}") from exc