import os
import time
from pathlib import Path
import gradio as gr
import requests
# Filesystem layout, resolved relative to this module.
APP_ROOT = Path(__file__).parent
SHOWCASE_DIR = APP_ROOT.joinpath("showcase")
GITHUB_URL = "https://github.com/bladedevoff/studiomi300"

# Backend endpoint and credentials come from the environment. An unset or
# empty STUDIO_API_URL leaves API_URL as "" (falsy); a trailing "/" is stripped
# so paths can be appended with a plain f-string.
API_URL = (os.getenv("STUDIO_API_URL") or "").rstrip("/")
API_TOKEN = os.getenv("STUDIO_API_TOKEN", "")
API_HEADERS = {} if not API_TOKEN else {"X-API-Token": API_TOKEN}

# Local mp4 cache: the Space downloads from the droplet over HTTP (server-side,
# so no mixed-content issue), then serves the file to the browser over Gradio's
# HTTPS file route.
DEMO_CACHE = APP_ROOT.joinpath("demo_cache")
DEMO_CACHE.mkdir(exist_ok=True)
def cache_demo_mp4(job_id):
    """Fetch a demo mp4 from the droplet API into the Space's local cache.

    The download is streamed into a temporary file inside ``DEMO_CACHE`` and
    only renamed to ``<job_id>.mp4`` once it has completed, so an interrupted
    transfer can never be mistaken for a valid cached video on a later call.

    Args:
        job_id: Backend job identifier; used as the cache file stem.

    Returns:
        Path to the cached mp4, or None when the backend is not configured,
        answers with a non-200 status, or the request fails.
    """
    import contextlib
    import tempfile

    p = DEMO_CACHE / f"{job_id}.mp4"
    # Files of 1 KiB or less are treated as missing and fetched again.
    if p.exists() and p.stat().st_size > 1024:
        return p
    if not API_URL:
        return None

    tmp_name = None
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(f"{API_URL}/demos/{job_id}.mp4", timeout=120, stream=True) as r:
            if r.status_code != 200:
                return None
            fd, tmp_name = tempfile.mkstemp(dir=DEMO_CACHE, suffix=".part")
            with os.fdopen(fd, "wb") as f:
                for chunk in r.iter_content(64 * 1024):
                    f.write(chunk)
        # Atomic publish: readers see either no file or the complete file.
        os.replace(tmp_name, p)
        tmp_name = None
        return p
    except requests.RequestException:
        return None
    finally:
        # Drop the partial download if anything went wrong before the rename.
        if tmp_name is not None:
            with contextlib.suppress(OSError):
                os.unlink(tmp_name)
# Static metadata for the pre-rendered showcase reels. Each entry is a dict
# with display text ("title", "logline", "prompt", "music_style", "vo_lang"),
# numeric facts ("render_time_min", "shots") and the list of models used
# ("stack_used"). "video" is a file name; presumably it is resolved against
# SHOWCASE_DIR by the code that renders the showcase - confirm at the consumer.
SHOWCASE_REELS = [
{
"title": "San Francisco walk - golden hour to blue hour",
"video": "sf_walk.mp4",
"logline": "A young woman walks alone down a steep Pacific Heights street, past painted Victorians and rolling fog, to a quiet overlook of the Golden Gate Bridge as the light shifts to blue hour.",
"prompt": (
"30-second cinematic reel: a young woman walks alone through San Francisco "
"at golden hour - down a steep Pacific Heights street with bay views, past "
"painted Victorian houses, fog rolling in over the Pacific, ending at a "
"quiet overlook of the Golden Gate Bridge as the light shifts to blue hour"
),
"music_style": "intimate ambient piano with a soft synth pad, 75 BPM, contemplative",
"vo_lang": "American English (Director picked from setting)",
"render_time_min": 81,
"shots": 6,
"stack_used": [
"Director Agent: Qwen3.5-35B-A3B (vLLM, AITER MoE)",
"Vision Critic: same Qwen3.5 reload, 4 frames per clip",
"Image: FLUX.2 [klein] 4B reference editing",
"Video: Wan2.2-I2V-A14B + FBCache + torch.compile + FLF2V on cut:false arcs",
"Music: ACE-Step v1 3.5B",
"Voice-over: Kokoro-82M, per-shot wavs, ffmpeg adelay sync",
],
},
]
# Hackathon tag string; the same text is hard-coded in the hero badge row.
HACKATHON_BADGE = "amd-hackathon-2026"
def fetch_demos(limit=50):
    """Return the list of demo records from the backend's ``/demos`` endpoint.

    Args:
        limit: Maximum number of records requested (sent as the ``limit``
            query parameter).

    Returns:
        A list of dict records. Empty when the backend is not configured, the
        request fails, the status is not 200, or the body is not a JSON list.
        Non-dict entries are dropped so the gallery renderer can rely on
        ``dict.get``.
    """
    if not API_URL:
        return []
    try:
        r = requests.get(f"{API_URL}/demos", params={"limit": limit}, timeout=10)
        if r.status_code != 200:
            return []
        payload = r.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a 200 response whose body is not valid JSON.
        return []
    if not isinstance(payload, list):
        return []
    return [item for item in payload if isinstance(item, dict)]
def backend_health():
    """Report the backend state as a short human-readable string.

    Returns one of "not configured" (no API_URL), "busy (rendering)" or
    "idle" (from the ``gpu_busy`` flag of a 200 ``/health`` response), or
    "offline" for any other status or a failed request.
    """
    if not API_URL:
        return "not configured"
    try:
        resp = requests.get(f"{API_URL}/health", timeout=5)
        if resp.status_code != 200:
            return "offline"
        if resp.json().get("gpu_busy"):
            return "busy (rendering)"
        return "idle"
    except requests.RequestException:
        return "offline"
def render_demo_card(d):
    """Render one gallery card (video + prompt + duration) as an HTML string.

    The prompt is user-submitted text, so it is HTML-escaped before being
    embedded; the video URL is escaped for use inside an attribute.

    Args:
        d: Demo record dict; reads the keys "id", "prompt" and "duration_s".

    Returns:
        The card markup, or "" when the record has no id or its mp4 could not
        be cached locally.
    """
    import html

    job_id = d.get("id")
    if not job_id:
        return ""
    p = cache_demo_mp4(job_id)
    if p is None:
        return ""

    # Truncate first, then escape, so an entity is never cut in half.
    prompt = html.escape(str(d.get("prompt") or "")[:240])
    try:
        duration = int(float(d.get("duration_s") or 0))
    except (TypeError, ValueError):
        duration = 0
    src = html.escape(f"/gradio_api/file={p}", quote=True)  # Gradio HTTPS file route
    return (
        f'<div class="demo-card">'
        f'<video src="{src}" controls preload="metadata" loop muted playsinline></video>'
        f'<div class="demo-prompt">{prompt}</div>'
        f'<div class="demo-meta">{duration}s render</div>'
        f'</div>'
    )
def render_demo_grid(demos, top_n=10):
    """Build the gallery HTML for a list of demo records.

    The first ``top_n`` records go into a visible grid; any remaining records
    go into a second grid inside a collapsed <details> element. An empty list
    yields a placeholder whose message depends on whether API_URL is set.
    """
    def _grid(records):
        body = "".join(render_demo_card(rec) for rec in records)
        return f'<div class="demo-grid">{body}</div>'

    if not demos:
        msg = (
            "No live generations yet. Be the first."
            if API_URL
            else "Live demo backend not configured."
        )
        return f'<div class="demo-empty">{msg}</div>'

    recent, older = demos[:top_n], demos[top_n:]
    pieces = [_grid(recent)]
    if older:
        pieces.append(
            f'<details class="demo-more"><summary>Show {len(older)} older</summary>'
            + _grid(older)
            + '</details>'
        )
    return "".join(pieces)
# Maps a backend stage identifier to the human-readable label shown in the
# status line. submit_demo falls back to the raw stage id for unknown stages.
STAGE_LABELS = {
"queued": "queued",
"starting": "starting up",
"klein_loading": "loading FLUX.2 klein 4B",
"keyframe_starting": "painting keyframe",
"keyframe_ready": "keyframe ready",
"wan_loading": "loading Wan2.2-I2V-A14B",
"wan_rendering": "animating with Wan2.2",
"rendered": "video rendered",
"music_starting": "generating music (ACE-Step)",
"music_ready": "music ready",
"music_skipped": "music skipped",
"music_failed": "music failed (silent video)",
"mix_starting": "mixing audio onto video",
"mix_done": "final mp4 ready",
"completed": "done",
"done": "done",
}
# Per-stage completion fraction in the range 0.0-1.0, keyed by the same stage
# identifiers as STAGE_LABELS.
# NOTE(review): not referenced in the visible part of this file - presumably
# feeds a progress bar elsewhere; confirm before removing.
STAGE_PROGRESS = {
"queued": 0.02, "starting": 0.04,
"klein_loading": 0.08, "keyframe_starting": 0.12, "keyframe_ready": 0.18,
"wan_loading": 0.24,
"wan_rendering": 0.80,
"rendered": 0.86,
"music_starting": 0.88,
"music_ready": 0.95,
"music_skipped": 0.95, "music_failed": 0.95,
"mix_starting": 0.97,
"mix_done": 1.0,
"completed": 1.0, "done": 1.0,
}
def submit_demo(prompt, request: gr.Request = None):
    """Submit a prompt to the render backend and stream status until it ends.

    This is a generator: every ``yield`` is a 3-tuple of
    ``(status_markdown, video_path_or_None, gallery_update)``. Intermediate
    yields carry no video and a no-op gallery update; the final yield carries
    the locally cached mp4 path (if the download worked) and a refreshed
    gallery.

    Args:
        prompt: Free-text prompt. It is stripped and must be 20-1500
            characters long.
        request: Optional Gradio request, used only to forward the caller's
            IP address and user agent to the backend.

    Raises:
        gr.Error: The backend is not configured or unreachable, the prompt is
            invalid, the submit call is rejected or returns an unexpected
            body, the job fails, or it does not finish within 15 minutes.
    """
    if not API_URL:
        raise gr.Error("Live demo backend not configured. Visit later.")
    p = (prompt or "").strip()
    if len(p) < 20:
        raise gr.Error("Prompt must be at least 20 characters.")
    if len(p) > 1500:
        raise gr.Error("Prompt too long (1500 char max).")

    headers = dict(API_HEADERS)
    if request is not None:
        # Best effort only: a request object without the expected attributes
        # must not block the submission.
        try:
            fwd = request.headers.get("x-forwarded-for", "") if request.headers else ""
            user_ip = fwd.split(",")[0].strip() if fwd else (request.client.host if request.client else "")
            if user_ip:
                headers["X-Forwarded-For"] = user_ip
            ua = request.headers.get("user-agent", "") if request.headers else ""
            if ua:
                headers["X-Original-User-Agent"] = ua[:200]
        except (AttributeError, KeyError):
            pass

    try:
        r = requests.post(f"{API_URL}/jobs", headers=headers, json={
            "prompt": p, "mode": "demo", "use_critic": False,
        }, timeout=15)
    except requests.RequestException as e:
        raise gr.Error(f"backend unreachable: {e}") from e
    if r.status_code == 401:
        raise gr.Error("backend rejected token (Space secret out of sync)")
    if r.status_code == 422:
        raise gr.Error("Prompt rejected by content policy. Please rephrase.")
    if r.status_code != 200:
        raise gr.Error(f"submit failed: {r.text[:200]}")
    try:
        job_id = r.json()["job_id"]
    except (ValueError, KeyError, TypeError) as e:
        # A 200 response without a usable job id is reported like any other
        # failed submit instead of surfacing a raw traceback.
        raise gr.Error(f"submit failed: {r.text[:200]}") from e

    yield f"**Job {job_id}** · submitted, waiting for GPU\n\n> {p}", None, gr.update()

    deadline = time.time() + 900  # 15 minutes
    last_render = ""
    while time.time() < deadline:
        time.sleep(2)
        try:
            meta = requests.get(f"{API_URL}/jobs/{job_id}", headers=API_HEADERS, timeout=10).json()
        except (requests.RequestException, ValueError):
            # Transient network error or non-JSON body: poll again.
            continue
        if not isinstance(meta, dict):
            continue

        stage = meta.get("stage", "queued")
        status = meta.get("status", "queued")
        started = meta.get("started")
        elapsed = int(time.time() - started) if started else 0

        if status == "queued":
            pos = meta.get("queue_position", 0)
            qsize = meta.get("queue_size", 1)
            if pos:
                status_md = f"**Job {job_id}** · queued at **position {pos} of {qsize}**, waiting for GPU\n\n> {p}"
            else:
                status_md = f"**Job {job_id}** · queued\n\n> {p}"
        else:
            label = STAGE_LABELS.get(stage, stage)
            status_md = f"**Job {job_id}** · {label} · {elapsed}s elapsed\n\n> {p}"

        if status == "done":
            # Fall back to "now" / zero length when a timestamp is missing so
            # the reported duration is never a raw epoch value or negative.
            finished = meta.get("finished") or time.time()
            duration = max(0, int(finished - (started or finished)))
            local = cache_demo_mp4(job_id)  # download mp4 to the Space's local fs
            done_md = f"### Done in {duration}s\n\n**Job {job_id}** · saved to server, added to gallery below.\n\n> {p}"
            yield done_md, str(local) if local else None, gr.update(value=render_demo_grid(fetch_demos()))
            return
        if status == "failed":
            raise gr.Error(f"job failed at stage `{stage}`. Check droplet logs.")

        # Only push an update to the UI when the text actually changed.
        if status_md != last_render:
            last_render = status_md
            yield status_md, None, gr.update()

    raise gr.Error("timeout (>15 min). The droplet may be stuck or queue too long.")
def refresh_gallery():
    """Re-fetch the demo list from the backend and return the gallery HTML."""
    demos = fetch_demos()
    return render_demo_grid(demos)
# Stylesheet for the whole page, kept as one raw string. It overrides Gradio's
# theme variables to force a dark palette, then styles the custom HTML blocks
# built in this file (hero, stat strip, pipeline stages, label grid, incidents,
# perf bars, charts, demo gallery, footer) and ends with a <=720px mobile
# section. The class names here must stay in sync with the HTML constants and
# render_* helpers that emit them.
CUSTOM_CSS = r"""
/* Force dark theme regardless of system / browser preference.
Overrides Gradio's neutral scale (used as bg in light mode) to dark values
so the page renders the same way whether or not .dark class is present. */
:root {
--grad-a: #a78bfa;
--grad-b: #f472b6;
--grad-c: #fbbf24;
--bg-card: #0f172a;
--bg-deep: #020617;
--border-card: rgba(167, 139, 250, 0.32);
--text-main: #f1f5f9;
--text-mute: #94a3b8;
color-scheme: dark;
}
html, body, gradio-app, .gradio-container {
background: #020617 !important;
color: #f1f5f9 !important;
}
.gradio-container,
gradio-app {
--body-background-fill: #020617;
--body-background-fill-dark: #020617;
--background-fill-primary: #0f172a;
--background-fill-primary-dark: #0f172a;
--background-fill-secondary: #1e293b;
--background-fill-secondary-dark: #1e293b;
--block-background-fill: #0f172a;
--block-background-fill-dark: #0f172a;
--block-border-color: rgba(167, 139, 250, 0.32);
--block-border-color-dark: rgba(167, 139, 250, 0.32);
--border-color-primary: rgba(167, 139, 250, 0.32);
--border-color-primary-dark: rgba(167, 139, 250, 0.32);
--input-background-fill: #0f172a;
--input-background-fill-dark: #0f172a;
--input-border-color: rgba(167, 139, 250, 0.32);
--input-border-color-dark: rgba(167, 139, 250, 0.32);
--button-secondary-background-fill: #1e293b;
--button-secondary-background-fill-dark: #1e293b;
--button-secondary-text-color: #f1f5f9;
--button-secondary-text-color-dark: #f1f5f9;
--body-text-color: #f1f5f9;
--body-text-color-dark: #f1f5f9;
--body-text-color-subdued: #94a3b8;
--body-text-color-subdued-dark: #94a3b8;
--color-accent: #a78bfa;
--color-accent-soft: rgba(167, 139, 250, 0.15);
--neutral-50: #1e293b;
--neutral-100: #1e293b;
--neutral-200: #334155;
--neutral-300: #475569;
--neutral-400: #64748b;
--neutral-500: #94a3b8;
--neutral-600: #cbd5e1;
--neutral-700: #e2e8f0;
--neutral-800: #f1f5f9;
--neutral-900: #f8fafc;
}
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; padding-left: 1rem !important; padding-right: 1rem !important; }
.app, .main, footer { margin: 0 auto !important; }
/* hero - always dark backdrop so the gradient text stays vivid in light/dark themes alike */
.hero {
text-align: center;
padding: 3rem 1.2rem 2rem 1.2rem;
background:
radial-gradient(ellipse 70% 90% at 50% 0%, rgba(244, 114, 182, .35), transparent 65%),
radial-gradient(ellipse 70% 90% at 50% 100%, rgba(167, 139, 250, .30), transparent 65%),
linear-gradient(180deg, #0b1120 0%, #050816 100%);
border-radius: 22px;
margin-bottom: 1rem;
border: 1px solid rgba(167, 139, 250, .25);
box-shadow: 0 14px 50px rgba(124, 58, 237, .18);
}
.hero-title {
font-size: clamp(2.6rem, 6vw, 4.6rem);
font-weight: 900;
line-height: 1;
letter-spacing: -0.03em;
background: linear-gradient(95deg, #c4b5fd 0%, #f9a8d4 50%, #fde68a 100%);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
-webkit-text-fill-color: transparent;
text-shadow: 0 4px 36px rgba(244, 114, 182, .25);
margin: 0;
}
.hero-tagline {
font-size: clamp(1.05rem, 2vw, 1.35rem);
color: #e2e8f0;
margin-top: 0.85rem;
font-weight: 500;
max-width: 720px;
margin-left: auto;
margin-right: auto;
line-height: 1.5;
}
.badge-row { display: flex; justify-content: center; gap: 0.5rem; flex-wrap: wrap; margin-top: 1.4rem; }
.badge {
background: rgba(15, 23, 42, 0.85);
border: 1px solid rgba(148, 163, 184, .25);
padding: 0.4rem 0.95rem;
border-radius: 999px;
font-size: 0.83rem;
font-weight: 700;
letter-spacing: 0.01em;
backdrop-filter: blur(4px);
}
.badge-amd { color: #fca5a5; }
.badge-rocm { color: #fde68a; }
.badge-license { color: #6ee7b7; }
.badge-tag { color: #c4b5fd; }
/* stats strip - always dark tiles with bright gradient numbers */
.stat-strip { display: grid; grid-template-columns: repeat(4, 1fr); gap: 0.75rem; margin: 1.2rem 0 1.8rem 0; }
.stat-tile {
background: linear-gradient(160deg, #131c33 0%, #0a1023 100%);
border: 1px solid var(--border-card);
border-radius: 14px;
padding: 1.1rem 0.8rem;
text-align: center;
box-shadow: 0 6px 22px rgba(124, 58, 237, .08);
}
.stat-num {
font-size: 2.2rem;
font-weight: 900;
background: linear-gradient(95deg, #c4b5fd 0%, #f9a8d4 100%);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
-webkit-text-fill-color: transparent;
line-height: 1.05;
text-shadow: 0 2px 18px rgba(244, 114, 182, .28);
}
.stat-lbl { font-size: 0.76rem; color: #cbd5e1; margin-top: 0.4rem; text-transform: uppercase; letter-spacing: 0.06em; font-weight: 600; }
@media (max-width: 720px) { .stat-strip { grid-template-columns: repeat(2, 1fr); } }
/* pipeline diagram */
.pipeline {
display: grid;
grid-template-columns: repeat(2, 1fr);
gap: 0.85rem;
margin: 1.5rem 0;
}
@media (max-width: 720px) { .pipeline { grid-template-columns: 1fr; } }
.stage {
position: relative;
background: linear-gradient(160deg, rgba(124, 58, 237, .07), rgba(15, 23, 42, .72));
border: 1px solid var(--border-card);
border-radius: 14px;
padding: 1.05rem 1.15rem;
display: flex;
gap: 0.85rem;
align-items: flex-start;
transition: transform .12s ease, border-color .12s ease;
}
.stage:hover { transform: translateY(-2px); border-color: rgba(236, 72, 153, .45); }
.stage-num {
flex: 0 0 2.4rem;
height: 2.4rem;
border-radius: 12px;
background: linear-gradient(135deg, var(--grad-a), var(--grad-b));
color: white;
font-weight: 800;
font-size: 1.05rem;
display: flex;
align-items: center;
justify-content: center;
}
.stage-body { flex: 1; }
.stage-title { font-weight: 700; font-size: 1.02rem; margin: 0 0 0.2rem 0; color: #e2e8f0; }
.stage-meta { font-size: 0.78rem; color: #fbbf24; font-weight: 600; margin-bottom: 0.35rem; letter-spacing: 0.02em; }
.stage-desc { font-size: 0.88rem; color: #cbd5e1; line-height: 1.5; margin: 0; }
/* failure label table */
.label-grid {
display: grid;
grid-template-columns: repeat(2, 1fr);
gap: 0.65rem;
margin: 1rem 0 0.5rem 0;
}
@media (max-width: 720px) { .label-grid { grid-template-columns: 1fr; } }
.label-card {
background: var(--bg-card);
border: 1px solid var(--border-card);
border-radius: 12px;
padding: 0.85rem 1rem;
}
.label-name {
font-family: 'JetBrains Mono', ui-monospace, monospace;
font-size: 0.78rem;
color: #f87171;
font-weight: 700;
letter-spacing: 0.02em;
background: rgba(248, 113, 113, .08);
padding: 0.2rem 0.45rem;
border-radius: 6px;
display: inline-block;
}
.label-desc { font-size: 0.85rem; color: #cbd5e1; margin-top: 0.5rem; line-height: 1.45; }
.label-fix { font-size: 0.8rem; color: #34d399; margin-top: 0.4rem; }
/* incident card */
.incident {
border-left: 3px solid var(--grad-b);
background: linear-gradient(95deg, rgba(236, 72, 153, .08), transparent 70%);
padding: 1rem 1.2rem;
border-radius: 10px;
margin: 0.85rem 0;
}
.incident-date { font-size: 0.75rem; color: #f87171; font-weight: 700; letter-spacing: 0.03em; text-transform: uppercase; }
.incident-title { font-weight: 700; font-size: 1.05rem; color: #e2e8f0; margin: 0.25rem 0 0.4rem 0; }
.incident-body { font-size: 0.9rem; color: #cbd5e1; line-height: 1.55; }
.incident-fix { font-size: 0.85rem; color: #86efac; margin-top: 0.4rem; }
/* perf bar */
.perf {
background: var(--bg-card);
border-radius: 10px;
padding: 0.65rem 0.85rem;
margin: 0.4rem 0;
display: grid;
grid-template-columns: 1fr 5rem;
gap: 0.75rem;
align-items: center;
}
.perf-label { font-size: 0.88rem; color: #e2e8f0; }
.perf-val { font-weight: 700; color: #6ee7b7; text-align: right; font-size: 0.9rem; }
.perf-bar { grid-column: 1 / -1; height: 6px; background: rgba(148, 163, 184, .14); border-radius: 4px; overflow: hidden; }
.perf-fill { height: 100%; background: linear-gradient(90deg, var(--grad-a), var(--grad-b)); border-radius: 4px; }
/* chart blocks */
.chart-card {
background: linear-gradient(160deg, #0f172a 0%, #060b1c 100%);
border: 1px solid var(--border-card);
border-radius: 14px;
padding: 1.1rem 1.2rem;
margin: 1rem 0;
}
.chart-title {
font-weight: 700;
font-size: 1rem;
color: #e2e8f0;
margin: 0 0 0.2rem 0;
letter-spacing: 0.01em;
}
.chart-sub { color: var(--text-mute); font-size: 0.82rem; margin: 0 0 0.85rem 0; }
/* horizontal bar chart - one row */
.hbar-row {
display: grid;
grid-template-columns: 12rem 1fr 4.5rem;
gap: 0.7rem;
align-items: center;
padding: 0.32rem 0;
font-size: 0.84rem;
}
.hbar-label { color: #e2e8f0; }
.hbar-track { background: rgba(148, 163, 184, .12); height: 12px; border-radius: 4px; overflow: hidden; }
.hbar-fill {
height: 100%;
border-radius: 4px;
background: linear-gradient(90deg, #c4b5fd, #f472b6);
display: flex; align-items: center; justify-content: flex-end;
padding-right: 0.4rem;
box-shadow: 0 0 12px rgba(244, 114, 182, .35);
}
.hbar-val { color: #6ee7b7; font-weight: 700; text-align: right; font-feature-settings: "tnum"; }
.hbar-val.muted { color: #fbbf24; }
.hbar-fill.warm { background: linear-gradient(90deg, #fde68a, #f97316); box-shadow: 0 0 12px rgba(249, 115, 22, .35); }
.hbar-fill.cold { background: linear-gradient(90deg, #67e8f9, #818cf8); box-shadow: 0 0 12px rgba(129, 140, 248, .35); }
@media (max-width: 720px) { .hbar-row { grid-template-columns: 8rem 1fr 3.5rem; font-size: 0.78rem; } }
/* stacked bar (one row, multiple segments) */
.stack-bar {
width: 100%;
height: 26px;
border-radius: 6px;
display: flex;
overflow: hidden;
margin: 0.4rem 0;
border: 1px solid rgba(148, 163, 184, .15);
}
.stack-seg {
height: 100%;
display: flex; align-items: center; justify-content: center;
font-size: 0.72rem;
color: white;
font-weight: 700;
text-shadow: 0 1px 2px rgba(0,0,0,.4);
white-space: nowrap;
overflow: hidden;
}
.stack-legend { display: flex; flex-wrap: wrap; gap: 0.6rem; margin-top: 0.5rem; font-size: 0.78rem; color: #cbd5e1; }
.stack-dot { display: inline-block; width: 0.65rem; height: 0.65rem; border-radius: 50%; margin-right: 0.3rem; vertical-align: middle; }
/* placeholder card while reel renders */
.placeholder {
border: 1.5px dashed rgba(148, 163, 184, .3);
border-radius: 14px;
padding: 2.2rem 1.5rem;
text-align: center;
background: linear-gradient(160deg, rgba(124, 58, 237, .06), transparent);
}
.placeholder-emoji { font-size: 2.4rem; margin-bottom: 0.6rem; }
.placeholder-title { font-weight: 700; font-size: 1.1rem; color: #e2e8f0; margin-bottom: 0.4rem; }
.placeholder-body { font-size: 0.92rem; color: var(--text-mute); max-width: 520px; margin: 0 auto; line-height: 1.5; }
/* live demo */
.demo-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 0.85rem;
margin: 0.6rem 0 0.3rem 0;
}
.demo-card {
background: linear-gradient(160deg, #131c33 0%, #0a1023 100%);
border: 1px solid var(--border-card);
border-radius: 12px;
padding: 0.6rem;
display: flex; flex-direction: column; gap: 0.4rem;
}
.demo-card video { width: 100%; border-radius: 8px; background: #000; aspect-ratio: 16/9; object-fit: cover; }
.demo-prompt { font-size: 0.82rem; color: #cbd5e1; line-height: 1.35; }
.demo-meta { font-size: 0.72rem; color: var(--text-mute); letter-spacing: 0.04em; }
.demo-empty { padding: 1.5rem 1rem; text-align: center; color: var(--text-mute); border: 1.5px dashed rgba(148,163,184,.25); border-radius: 12px; }
.demo-more { margin-top: 0.8rem; }
.demo-more summary { cursor: pointer; color: #c4b5fd; font-weight: 600; padding: 0.6rem 1rem; background: rgba(124,58,237,.08); border-radius: 8px; user-select: none; }
.demo-more summary:hover { background: rgba(124,58,237,.16); }
.demo-more[open] summary { margin-bottom: 0.8rem; }
/* footer */
.footer {
text-align: center;
color: var(--text-mute);
font-size: 0.85rem;
padding: 1.5rem 0 0.5rem 0;
border-top: 1px solid rgba(148, 163, 184, .12);
margin-top: 2rem;
}
.footer a { color: #a78bfa; text-decoration: none; }
.footer a:hover { color: #ec4899; }
/* mobile - tighten everything for <=720px */
@media (max-width: 720px) {
.gradio-container { padding-left: 0.5rem !important; padding-right: 0.5rem !important; }
.hero { padding: 1.5rem 0.7rem 1.1rem 0.7rem; border-radius: 14px; margin-bottom: 0.6rem; }
.hero-title { font-size: 2.4rem !important; line-height: 1.05; }
.hero-tagline { font-size: 0.98rem; margin-top: 0.6rem; }
.badge-row { gap: 0.35rem; margin-top: 1rem; }
.badge { font-size: 0.72rem; padding: 0.3rem 0.7rem; }
.stat-strip { gap: 0.5rem; margin: 0.7rem 0 1.1rem 0; }
.stat-tile { padding: 0.75rem 0.4rem; }
.stat-num { font-size: 1.55rem; }
.stat-lbl { font-size: 0.62rem; letter-spacing: 0.04em; }
.stage { padding: 0.85rem 0.95rem; gap: 0.6rem; }
.stage-num { flex: 0 0 2rem; height: 2rem; font-size: 0.95rem; border-radius: 9px; }
.stage-title { font-size: 0.96rem; }
.stage-meta { font-size: 0.7rem; }
.stage-desc { font-size: 0.82rem; line-height: 1.45; }
.label-card { padding: 0.7rem 0.85rem; }
.label-name { font-size: 0.7rem; padding: 0.18rem 0.4rem; }
.label-desc { font-size: 0.78rem; }
.label-fix { font-size: 0.74rem; }
.demo-grid { gap: 0.6rem; }
.demo-card { padding: 0.45rem; }
.demo-prompt { font-size: 0.78rem; }
.demo-meta { font-size: 0.66rem; }
.incident { padding: 0.75rem 0.9rem; margin: 0.6rem 0; }
.incident-title { font-size: 0.96rem; }
.incident-body { font-size: 0.82rem; line-height: 1.5; }
.incident-fix { font-size: 0.78rem; }
.perf { padding: 0.55rem 0.7rem; grid-template-columns: 1fr 4rem; }
.perf-label { font-size: 0.78rem; }
.perf-val { font-size: 0.78rem; }
.chart-card { padding: 0.85rem 0.9rem; }
.chart-title { font-size: 0.94rem; }
.chart-sub { font-size: 0.74rem; }
.stack-bar { height: 22px; }
.stack-seg { font-size: 0.62rem; }
.stack-legend { font-size: 0.72rem; gap: 0.4rem; }
.footer { font-size: 0.78rem; padding: 1rem 0 0.3rem 0; }
/* let wide markdown tables and curl pre-blocks scroll horizontally */
.prose table, .markdown table { display: block; overflow-x: auto; -webkit-overflow-scrolling: touch; max-width: 100%; }
pre { overflow-x: auto; -webkit-overflow-scrolling: touch; font-size: 0.76rem; }
code { word-break: break-word; }
}
"""
# Hero banner: project title, tagline and the badge row. Static markup with
# no placeholders. Separators are real Unicode characters (→, —, ·) so the
# banner does not render mis-decoded text.
HERO_HTML = """
<div class="hero">
<h1 class="hero-title">StudioMI300</h1>
<div class="hero-tagline">
One prompt → 30-second cinematic reel.<br>
Director Agent + vision critic + image, video, music & voice models — all on a single AMD Instinct MI300X.
</div>
<div class="badge-row">
<span class="badge badge-amd">AMD MI300X · 192 GB HBM3</span>
<span class="badge badge-rocm">ROCm 7.2 + AITER</span>
<span class="badge badge-license">Apache 2.0 / MIT</span>
<span class="badge badge-tag">amd-hackathon-2026</span>
</div>
</div>
"""
# Four-tile headline statistics strip shown under the hero. Static markup;
# the speedup tile uses a real multiplication sign.
STATS_HTML = """
<div class="stat-strip">
<div class="stat-tile"><div class="stat-num">1</div><div class="stat-lbl">MI300X GPU</div></div>
<div class="stat-tile"><div class="stat-num">6</div><div class="stat-lbl">Models orchestrated</div></div>
<div class="stat-tile"><div class="stat-num">2.5×</div><div class="stat-lbl">Lossless speedup</div></div>
<div class="stat-tile"><div class="stat-num">9</div><div class="stat-lbl">VO languages</div></div>
</div>
"""
# Eight-stage pipeline diagram: one "stage" card per step, each with a number,
# title, model/settings line and a short description. Static markup. Separators
# and symbols are real Unicode characters (·, ×, →, —) so the cards do not
# render mis-decoded text.
PIPELINE_HTML = """
<div class="pipeline">
<div class="stage">
<div class="stage-num">1</div>
<div class="stage-body">
<div class="stage-title">Director Agent</div>
<div class="stage-meta">Qwen3.5-35B-A3B · vLLM · AITER MoE</div>
<p class="stage-desc">Plans 6 cinematic shots with character portraits, music brief, voice-over script and language tag. Same checkpoint doubles as the vision critic in stage 5.</p>
</div>
</div>
<div class="stage">
<div class="stage-num">2</div>
<div class="stage-body">
<div class="stage-title">Character Masters</div>
<div class="stage-meta">FLUX.2 [klein] 4B · 4-step distilled · ~0.4 s/master</div>
<p class="stage-desc">One canonical image per character + an ABC group composition. These pin identity for every downstream shot.</p>
</div>
</div>
<div class="stage">
<div class="stage-num">3</div>
<div class="stage-body">
<div class="stage-title">Per-shot Keyframes</div>
<div class="stage-meta">FLUX.2 [klein] 4B reference editing · ~0.6 s/shot</div>
<p class="stage-desc">Master image goes in as conditioning, shot prompt drives the edit. Identity is preserved by construction — no LoRA training, no per-character setup.</p>
</div>
</div>
<div class="stage">
<div class="stage-num">4</div>
<div class="stage-body">
<div class="stage-title">Animation</div>
<div class="stage-meta">Wan2.2-I2V-A14B · FBCache 0.05 · torch.compile</div>
<p class="stage-desc">Dual-expert MoE diffusion, 121 frames at 24 fps. ParaAttention FBCache 2× lossless + selective torch.compile on transformer_2 (1.2× compile win).</p>
</div>
</div>
<div class="stage">
<div class="stage-num">5</div>
<div class="stage-body">
<div class="stage-title">Vision Critic</div>
<div class="stage-meta">Qwen3.5-35B reload · 4 frames per clip · structured labels</div>
<p class="stage-desc">Grades each clip on character_match, scene_match, composition, artifact_free. Below 7/10 → re-render with a bumped seed (max 3 attempts).</p>
</div>
</div>
<div class="stage">
<div class="stage-num">6</div>
<div class="stage-body">
<div class="stage-title">Music</div>
<div class="stage-meta">ACE-Step v1 3.5B · 27 steps · 30 s output</div>
<p class="stage-desc">Audio diffusion produces a 30-second instrumental matching the Director's brief (BPM, mood, instrumentation, no drums hint).</p>
</div>
</div>
<div class="stage">
<div class="stage-num">7</div>
<div class="stage-body">
<div class="stage-title">Voice-over</div>
<div class="stage-meta">Kokoro-82M · 9 languages · ~0.05× RTF</div>
<p class="stage-desc">Director picks the language to match the setting (Tokyo→ja, Paris→fr, Mumbai→hi, ...). Script is written in that language, not translated.</p>
</div>
</div>
<div class="stage">
<div class="stage-num">8</div>
<div class="stage-body">
<div class="stage-title">Mix</div>
<div class="stage-meta">ffmpeg · concat + lanczos upscale + loudnorm</div>
<p class="stage-desc">Six clips concatenated, upscaled to 1280×704, audio loudness-normalised, output is a single mp4.</p>
</div>
</div>
</div>
"""
# Failure labels shown in the label grid. Each row is a
# (label name, what it looks like, fix applied) tuple of display strings.
CRITIC_LABELS = [
    ("STYLIZED_AI_LOOK", "plastic skin, oversaturation, 3D-render look", "bump anti-style negatives, tone keyframe saturation"),
    ("CHARACTER_DRIFT", "named character's face shifts mid-clip", "repeat exact character description string, prefer FLF2V"),
    ("EXTRAS_INVADE_FRAME", "unprompted extras crossing the main subjects", "add positive boundary sentence (\"no extras enter\")"),
    ("CAMERA_IGNORED", "the prompted camera move never happens", "put camera verb FIRST, use only one camera move"),
    ("OBJECT_MORPHING", "an object materially changes mid-clip", "describe material+color explicitly, 121 → 97 frames"),
    ("RANDOM_INTIMACY", "characters touch / hug / kiss without prompt", "add explicit \"they do not touch\" boundary"),
    ("NEON_GLOW_LEAK", "neon spilling onto faces or unprompted surfaces", "localize light sources, \"no glow on faces\""),
    ("WALKING_BACKWARDS", "subject walks the wrong direction", "specify direction explicitly (\"walks toward camera\")"),
    ("HAND_FINGER_ARTIFACT", "extra fingers, fused hands", "already in negative; reduce hand close-ups"),
    ("WARDROBE_DRIFT", "clothing color or style changes mid-clip", "anchor wardrobe in the repeated character string"),
]
def render_label_grid(labels=None):
    """Render critic failure labels as an HTML card grid.

    Args:
        labels: Optional iterable of ``(name, description, fix)`` string
            triples. When omitted, the module-level ``CRITIC_LABELS`` is used.

    Returns:
        A single ``<div class="label-grid">`` string holding one
        ``label-card`` per triple. Values are interpolated verbatim (no HTML
        escaping), so callers must pass trusted text.
    """
    if labels is None:
        labels = CRITIC_LABELS
    cards = "".join(
        '<div class="label-card">'
        f'<span class="label-name">{name}</span>'
        f'<div class="label-desc">{desc}</div>'
        f'<div class="label-fix">→ {fix}</div>'
        '</div>'
        for name, desc, fix in labels
    )
    return '<div class="label-grid">' + cards + "</div>"
# Field-journal entries shown on the "Incidents" tab. Each entry is a dict with
# the four string keys read by render_incidents(): "date", "title", "body", "fix".
INCIDENTS = [
    {
        "date": "May 7 · reel_v5",
        "title": "The headless violinist",
        "body": (
            "Wan2.2 invented a third violinist in the busker scene — without a head. "
            "Compound clauses like \"busker plays violin nearby\" got read as a request "
            "for an extra violin-holder, sometimes generated incomplete."
        ),
        "fix": "Added \"two heads, headless, extra people, ghost figures, duplicate character\" to the negative prompt. Hasn't recurred over 12 reels.",
    },
    {
        "date": "May 7 · reel_v6",
        "title": "Woman with violin",
        "body": (
            "The protagonist ended up holding a violin in shots 4–8 even though the prompt only said she walked past the busker. "
            "Master keyframe baked \"near violin\" into the protagonist embedding because the master prompt mentioned the instrument as setting context."
        ),
        "fix": "Stripped instrument refs from master_prompt v2. Master shows protagonist alone in setting baseline; instrument context goes via per-shot prompts only.",
    },
    {
        "date": "May 8 · qwen-tts",
        "title": "The 4-shim TTS nightmare",
        "body": (
            "Tried Qwen3-TTS-12Hz-0.6B for voice-over. Hit four cascading issues: hard-pinned transformers 4.57.3 vs rest of stack ≥5.x, "
            "a removed decorator API, a missing pad_token_id in config.json, and ROPE_INIT_FUNCTIONS dropped in transformers 5. "
            "Even after writing all four shims, hit a deeper SDPA shape mismatch."
        ),
        "fix": "Gave up after 1.5 hours, switched to Kokoro-82M (Apache 2.0, standalone, no transformers dependency). Ships in 9 languages.",
    },
    {
        "date": "May 9 · FP8 evaluation",
        "title": "AITER FP8 segfault on cross-attention",
        "body": (
            "Evaluated two FP8 paths on Wan2.2: torch._scaled_mm raised HIPBLAS_STATUS_NOT_SUPPORTED on ROCm 7.0, "
            "and aiter.gemm_a8w8 + gemm_a8w8_CK both segfaulted with \"Memory access fault by GPU node-1\" "
            "on the cross-attention shape M=512, K=4096, N=5120. ROCm 7.2 closed the standalone shape, "
            "but the same call inside the full Wan2.2 + FBCache + torch.compile pipeline still crashes (matches AITER#2187)."
        ),
        "fix": "Production ships on BF16 + FBCache + selective torch.compile (2.5× lossless). aiter_linear.py and STUDIOMI_AITER_FP8 env-toggle stay in the repo for future experiments.",
    },
    {
        "date": "May 9 · FBCache jitter",
        "title": "Motion tearing at high cache thresholds",
        "body": (
            "FBCache threshold 0.12 looked fast but introduced visible jitter on fast camera pans, especially in B-roll wides. "
            "Wan2.1 community had reported the same — at thresholds ≥0.09 you get tearing on motion."
        ),
        "fix": "Stepped down to 0.05. Slightly slower but lossless across the whole reel. The 0.05 / 0.08 / 0.12 sweep is in benchmarks/results.md.",
    },
    {
        "date": "May 10 · Director→Wan2.2 OOM",
        "title": "94 GB Wan2.2 won't fit if Qwen still resident",
        "body": (
            "After Director ran inference, vLLM left ~30 GB of allocator cache resident on top of its model weights. "
            "Wan2.2 needs 94 GB to load — total exceeded 192 GB and the load OOMed."
        ),
        "fix": "Director runs in a separate Python subprocess so its full memory frees on exit. gpu_memory_utilization lowered to 0.70.",
    },
    {
        "date": "May 10 · Multi-day caches survive",
        "title": "Container migration was painless",
        "body": (
            "When the original AMD Developer Cloud droplet got decommissioned for credit overuse, the new droplet inherited "
            "the same rocm/vllm-dev container image. The 247 GB HuggingFace cache survived intact via volume mount — "
            "no re-download of Wan2.2, FLUX.2, Qwen3.5, ACE-Step or Kokoro."
        ),
        "fix": "ACE-Step's separate cache (/root/.cache/ace-step/checkpoints, 7.6 GB) had to be re-fetched + four pip deps re-installed. Bootstrap script now pre-warms both.",
    },
]
def render_incidents(incidents=None):
    """Render field-journal entries as concatenated HTML cards.

    Args:
        incidents: Optional iterable of dicts with the string keys ``"date"``,
            ``"title"``, ``"body"`` and ``"fix"``. When omitted, the
            module-level ``INCIDENTS`` is used.

    Returns:
        One string with a ``<div class="incident">`` per entry, in input
        order (empty string for no entries). Values are interpolated verbatim
        (no HTML escaping), so callers must pass trusted text.

    Raises:
        KeyError: If an entry lacks one of the four keys.
    """
    if incidents is None:
        incidents = INCIDENTS
    return "".join(
        '<div class="incident">'
        f'<div class="incident-date">{inc["date"]}</div>'
        f'<div class="incident-title">{inc["title"]}</div>'
        f'<div class="incident-body">{inc["body"]}</div>'
        f'<div class="incident-fix">→ Fix: {inc["fix"]}</div>'
        '</div>'
        for inc in incidents
    )
# Rows for the "Per-knob multiplier breakdown" bars, read by render_perf_bars():
# (label, value text shown beside the bar, bar fill width in percent).
PERF_BARS = [
    ("ParaAttention FBCache (threshold 0.05)", "2.00×", 100),
    ("torch.compile(transformer_2, mode=\"default\")", "1.20×", 60),
    ("ROCm env flags (hipBLASLt, expandable_segments, etc.)", "1.10×", 55),
    ("UniPC scheduler with flow_shift=12.0 for 480p", "1.05×", 52),
    ("AITER MoE for Qwen3.5-35B planner", "~1.30× decode", 65),
    ("FLUX.2 [klein] 4B vs FLUX.1-schnell on keyframes", "~15× faster", 88),
]
def render_perf_bars():
    """Build the HTML for the per-knob multiplier bars.

    Reads the module-level ``PERF_BARS`` rows of ``(label, value text, fill
    percent)`` and returns the ``<div class="perf">`` snippets for all rows
    concatenated into one string, in list order.
    """
    return "".join(
        '<div class="perf">'
        f'<div class="perf-label">{caption}</div>'
        f'<div class="perf-val">{multiplier}</div>'
        f'<div class="perf-bar"><div class="perf-fill" style="width:{width}%"></div></div>'
        '</div>'
        for caption, multiplier, width in PERF_BARS
    )
# ── Wan2.2 cumulative speedup waterfall ───────────────────────────────────
# One row per optimisation step, read by render_speedup_waterfall():
# (label, minutes, speedup multiplier, extra CSS class for the bar fill).
# The CSS class may be empty; the bar width is derived from the minutes.
SPEEDUP_WATERFALL = [
    ("Baseline (BF16, no cache)", 25.9, 1.00, "warm"),
    ("+ FBCache 0.12 (both experts)", 12.46, 2.08, ""),
    ("+ flow_shift=5 + ROCm flags", 11.29, 2.30, ""),
    ("+ torch.compile(transformer_2)", 10.36, 2.50, "cold"),
]
def render_speedup_waterfall(rows=None):
    """Render the cumulative-speedup chart as an HTML card of horizontal bars.

    Args:
        rows: Optional iterable of ``(label, minutes, speedup, css_class)``
            tuples. When omitted, the module-level ``SPEEDUP_WATERFALL`` is
            used. ``css_class`` is appended to ``hbar-fill`` and may be empty.

    Returns:
        A ``<div class="chart-card">`` string. Each bar's width is its minutes
        as a percentage of the largest minutes value, so the slowest row
        spans the full track.
    """
    if rows is None:
        rows = SPEEDUP_WATERFALL
    rows = list(rows)  # iterated twice below; also accepts one-shot iterators
    # default=0 keeps an empty data set from raising inside max().
    longest = max((row[1] for row in rows), default=0)
    bars = []
    for label, mins, speedup, css_class in rows:
        # A zero maximum would divide by zero; draw empty bars instead.
        pct = (mins / longest) * 100 if longest else 0.0
        cls = f"hbar-fill {css_class}".strip()
        bars.append(
            f'<div class="hbar-row">'
            f'<div class="hbar-label">{label}</div>'
            f'<div class="hbar-track"><div class="{cls}" style="width:{pct:.1f}%"></div></div>'
            f'<div class="hbar-val">{mins:.1f} min · {speedup:.2f}×</div>'
            f'</div>'
        )
    return (
        '<div class="chart-card">'
        '<div class="chart-title">Wan2.2 720p cumulative speedup</div>'
        '<div class="chart-sub">Each row stacks multiplicatively; lower bar = faster. Same prompt, same seed.</div>'
        + "".join(bars) +
        '</div>'
    )
# ── VRAM peak per pipeline phase ──────────────────────────────────────────
# Rows read by render_vram_chart(): (label, peak VRAM in GB, mode).
# Mode "active" gets the warm bar colour; any other value gets the cold one.
VRAM_PHASES = [
    ("Director · Qwen3.5-35B BF16", 70, "active"),
    ("Klein 4B keyframes", 8, "idle"),
    ("Wan2.2-I2V-A14B animation", 94, "active"),
    ("Critic · Qwen3.5-35B vision", 70, "active"),
    ("ACE-Step v1 music", 12, "idle"),
    ("Kokoro-82M voice-over", 1, "idle"),
]
# Total card memory in GB; bar widths are a percentage of this value.
HBM_TOTAL = 192
def render_vram_chart(phases=None, hbm_total=None):
    """Render the per-phase VRAM peaks as an HTML card of horizontal bars.

    Args:
        phases: Optional iterable of ``(label, gb, mode)`` tuples. When
            omitted, the module-level ``VRAM_PHASES`` is used. Rows whose
            mode is ``"active"`` get the ``warm`` fill, all others ``cold``.
        hbm_total: Optional card capacity in GB that bar widths are scaled
            against. When omitted, the module-level ``HBM_TOTAL`` is used.

    Returns:
        A ``<div class="chart-card">`` string. The subtitle reports the
        Wan2.2 peak when a row whose label starts with ``"Wan2.2"`` exists.

    Raises:
        ZeroDivisionError: If ``hbm_total`` is 0.
    """
    if phases is None:
        phases = VRAM_PHASES
    if hbm_total is None:
        hbm_total = HBM_TOTAL
    phases = list(phases)  # iterated twice below

    bars = []
    for label, gb, mode in phases:
        pct = (gb / hbm_total) * 100
        cls = "hbar-fill warm" if mode == "active" else "hbar-fill cold"
        bars.append(
            f'<div class="hbar-row">'
            f'<div class="hbar-label">{label}</div>'
            f'<div class="hbar-track"><div class="{cls}" style="width:{pct:.1f}%"></div></div>'
            f'<div class="hbar-val">{gb} GB</div>'
            f'</div>'
        )

    # Find the Wan2.2 row by label rather than by position, so reordering or
    # inserting phases cannot make the subtitle quote the wrong number.
    wan_gb = next((gb for label, gb, _ in phases if label.startswith("Wan2.2")), None)
    subtitle = "Sequential, never concurrent."
    if wan_gb is not None:
        subtitle += (
            f" Wan2.2 hits {wan_gb}/{hbm_total} GB "
            f"({wan_gb / hbm_total * 100:.0f}% of the card) at peak."
        )

    return (
        '<div class="chart-card">'
        f'<div class="chart-title">VRAM peak per phase · {hbm_total} GB HBM3</div>'
        f'<div class="chart-sub">{subtitle}</div>'
        + "".join(bars) +
        '</div>'
    )
# ── End-to-end time breakdown for one reel (stacked bar) ──────────────────
# Segments read by render_time_breakdown(), drawn left to right in list order.
TIME_SEGMENTS = [
    # (label, minutes, color)
    ("Director plan", 0.5, "#a78bfa"),
    ("Masters + keyframes", 0.2, "#c4b5fd"),
    ("Wan2.2 hero @ 30 stp", 8.5, "#f472b6"),
    ("Wan2.2 5× B-roll @ 24", 33.0, "#ec4899"),
    ("Critic + retries", 5.0, "#fbbf24"),
    ("Music + VO + mix", 2.0, "#6ee7b7"),
]
def render_time_breakdown(segments=None):
    """Render the end-to-end time breakdown as a stacked bar with a legend.

    Args:
        segments: Optional iterable of ``(label, minutes, color)`` tuples.
            When omitted, the module-level ``TIME_SEGMENTS`` is used.

    Returns:
        A ``<div class="chart-card">`` string. Each segment's width is its
        share of the summed minutes; the title quotes that sum rounded to a
        whole number.
    """
    if segments is None:
        segments = TIME_SEGMENTS
    segments = list(segments)  # iterated twice below
    total = sum(seg[1] for seg in segments)
    segs, legend = [], []
    for label, mins, color in segments:
        # A zero total (empty or all-zero data) would divide by zero.
        pct = (mins / total) * 100 if total else 0.0
        # Segments under 7% are too narrow to fit an inline label.
        text = f'{mins:.1f}m' if pct >= 7 else ""
        segs.append(
            f'<div class="stack-seg" style="width:{pct:.2f}%; background:{color};" title="{label} {mins:.1f} min">{text}</div>'
        )
        legend.append(
            f'<span><span class="stack-dot" style="background:{color}"></span>{label} · {mins:.1f}m</span>'
        )
    return (
        '<div class="chart-card">'
        f'<div class="chart-title">Where the {total:.0f} minutes go</div>'
        '<div class="chart-sub">Single 30-second reel, end-to-end on 1× MI300X. Wan2.2 inference dominates.</div>'
        f'<div class="stack-bar">{"".join(segs)}</div>'
        f'<div class="stack-legend">{"".join(legend)}</div>'
        '</div>'
    )
# ── Critic pass-rate per attempt ──────────────────────────────────────────
# Segments read by render_pass_rate(): (label, percent, color).
# The percent is used directly as the segment's CSS width.
PASS_RATE = [
    ("Pass on attempt 1", 67, "#6ee7b7"),
    ("Pass on attempt 2", 22, "#fde68a"),
    ("Pass on attempt 3", 8, "#fb923c"),
    ("Best-of accepted", 3, "#f87171"),
]
def render_pass_rate():
    """Build the critic verdict distribution card.

    Reads the module-level ``PASS_RATE`` rows of ``(label, percent, color)``
    and returns a ``<div class="chart-card">`` string containing a stacked
    bar (one segment per row, width equal to its percent) and a legend.
    """
    bar_parts = []
    legend_parts = []
    for caption, share, swatch in PASS_RATE:
        # Only segments of at least 7% carry an inline percentage label.
        inline = "" if share < 7 else f'{share}%'
        bar_parts.append(
            f'<div class="stack-seg" style="width:{share}%; background:{swatch};" title="{caption} {share}%">{inline}</div>'
        )
        legend_parts.append(
            f'<span><span class="stack-dot" style="background:{swatch}"></span>{caption}</span>'
        )
    bar_html = "".join(bar_parts)
    legend_html = "".join(legend_parts)
    return (
        '<div class="chart-card">'
        '<div class="chart-title">Critic verdict distribution (rolling avg over recent reels)</div>'
        '<div class="chart-sub">Two-thirds of clips pass first try. The retry loop salvages another ~30%; only 3% fall through to best-of-three.</div>'
        f'<div class="stack-bar">{bar_html}</div>'
        f'<div class="stack-legend">{legend_html}</div>'
        '</div>'
    )
# HTML shown on the "Showcase" tab when SHOWCASE_REELS is empty.
SHOWCASE_PLACEHOLDER = """
<div class="placeholder">
<div class="placeholder-emoji">🎬</div>
<div class="placeholder-title">Reel rendering on the MI300X right now.</div>
<div class="placeholder-body">
Hot off the press: re-rendering the Tokyo Reunion reel through the new pipeline
(FLUX.2 [klein] 4B reference editing + Wan2.2 at 30 cinematic steps + vision critic).
Drops here as soon as it lands — ~50 minutes per reel on the droplet.<br><br>
The full code is on GitHub if you can't wait.
</div>
</div>
"""
# Markdown body of the "Story & languages" tab. Blank lines separate every
# Markdown block so that a paragraph following a table is not parsed as one
# more table row.
STORY_TAB_MD = r"""
## How the Director thinks

The Director Agent (Qwen3.5-35B-A3B via vLLM) doesn't just write a description.
It returns a structured 6-shot plan with named characters, per-shot prompts
(written in Wan2.2-friendly language: camera verb first, sentence-case motion,
positive boundary phrases), a music brief, a per-shot voice-over array, and the
language to narrate in.

```json
{
"characters": {
"A": "Aiko (slim Japanese woman, 27, jet-black chin-length bob, ...)",
"B": "Kenji (Japanese man, 28, tall and lean, ...)",
"C": "Mei (Japanese woman, 26, shoulder-length lavender hair, ...)"
},
"story_logline": "Aiko walks alone through neon-lit Tokyo and reunites with two friends",
"shots": [
{
"index": 0, "is_hero": true, "shot_type": "Wide tracking",
"dominant_subject": "A", "cut": true,
"prompt": "Tracking shot following from behind at hip level. Aiko (slim Japanese woman, 27, jet-black bob, mustard yellow vinyl raincoat) walks down the center of the wet street, head turning slightly. Distant pedestrians stay blurred. Light rain falls steadily, neon signs flicker. shot on Arri Alexa, anamorphic, 35mm film grain, photorealistic"
},
"... 5 more shots ..."
],
"music_style": "intimate ambient piano with warm pad and soft synth bell, 75 BPM, melancholic but hopeful, no drums",
"vo_script_per_shot": [
"She had been walking alone for too long.",
"Tonight, the city felt softer.",
"Two figures waited under an awning.",
"She broke into a quick walk.",
"Their arms found hers.",
"Some places only feel like home because of who is standing in them."
],
"vo_lang": "j"
}
```

The exact same character description string repeats verbatim in every shot
that character appears in. Token-level consistency is character-LoRA-without-LoRA-training.

### Six-shot story arc template

| Shot | Role | Cut |
|---|---|---|
| 0 | Hero wide establishing - all main characters visible | true |
| 1 | Setup - protagonist's intent or POV moves the story forward | false |
| 2 | Other element - secondary character solo or detail insert | true if scene changes |
| 3 | Climax - two-character moment or A-with-OBJECT | false |
| 4 | Static medium close-up - face anchor, reduces drift accumulation | false |
| 5 | Closing wide - scene fades or A walks away | false or true |

### Voice-over languages (Kokoro-82M)

Director picks the language that matches the setting. Tokyo scene -> Japanese,
Paris -> French, Mumbai -> Hindi, Rio -> Brazilian Portuguese, anywhere else -> American English.

| Code | Language | Default voice |
|---|---|---|
| `a` | American English | af_heart |
| `b` | British English | bf_emma |
| `e` | Spanish | ef_dora |
| `f` | French | ff_siwis |
| `h` | Hindi | hf_alpha |
| `i` | Italian | if_sara |
| `j` | Japanese | jf_alpha |
| `p` | Brazilian Portuguese | pf_dora |
| `z` | Mandarin Chinese | zf_xiaobei |

The `vo_script_per_shot` array is one line per shot, 6-10 words each (~3-4 seconds
of TTS at 150 wpm). Each Kokoro WAV gets layered onto the music bed at
`i * 5.04 s` offset via ffmpeg `adelay`, so the narration lands when the
visual beat lands - no description before or after the action.
"""
# Markdown body of the "Live API" tab. Blank lines separate every Markdown
# block so that text following a table is not parsed as another table row,
# and the Python snippet keeps the indentation it needs to be runnable.
API_TAB_MD = r"""
## Live API server

The pipeline ships as a FastAPI server with an asyncio.Lock backing a strict-FIFO
single-GPU queue. SSE event stream + per-artifact endpoints let a frontend
render the pipeline phases as they happen, instead of waiting 45 minutes for one mp4.

```bash
# on your MI300X droplet
STUDIO_API_TOKEN=secret uvicorn server:app --host 0.0.0.0 --port 8000
```

### Submit a job

```bash
curl -X POST https://your-droplet:8000/jobs \
-H "X-API-Token: secret" \
-H "Content-Type: application/json" \
-d '{"prompt": "30s reel: a violinist plays in a Brooklyn subway station at midnight, golden hour light through the platform windows", "use_critic": true}'
# -> {"job_id": "a3f9c1d2b6e8", "status": "queued"}
```

### Watch it happen

```bash
curl -N https://your-droplet:8000/jobs/a3f9c1d2b6e8/stream
# (SSE stream)
data: {"stage":"started","ts":1778425000.1,"prompt":"30s reel: ..."}
data: {"stage":"plan_starting","ts":1778425000.5}
data: {"stage":"plan_ready","ts":1778425245.3,"logline":"...","n_shots":6,"characters":["A"],"music_style":"...","shots":[{...}]}
data: {"stage":"master_ready","ts":1778425248.1,"name":"A","path":"...master_A.png","seconds":7.8}
data: {"stage":"keyframe_ready","ts":1778425250.0,"shot":0,"path":"...keyframe_00.png"}
data: {"stage":"clip_started","ts":1778425251.2,"shot":0,"attempt":1,"flow_shift":5.0,"n_steps":30,"flf2v":true}
data: {"stage":"clip_rendered","ts":1778425759.6,"shot":0,"path":"...clip_00.mp4","minutes":8.47}
data: {"stage":"critic_starting","ts":1778425760.1,"shot":0,"frames":[...]}
data: {"stage":"critic_verdict","ts":1778425853.4,"shot":0,"score":{"character_match":8,"scene_match":9,"composition":9,"artifact_free":7,"issues":["STYLIZED_AI_LOOK: ..."],"overall":8}}
data: {"stage":"clip_passed","ts":1778425881.0,"shot":0,"attempts":1,"score":{...}}
data: {"stage":"music_starting","ts":1778428100.0,"style":"..."}
data: {"stage":"music_ready","ts":1778428170.4,"path":"...music.wav"}
data: {"stage":"vo_chunk_ready","ts":1778428172.1,"shot":0,"path":"...vo_00.wav","seconds":3.4,"text":"..."}
data: {"stage":"mix_done","ts":1778428180.0,"path":"...reel_final.mp4"}
data: {"stage":"completed","ts":1778428180.5,"final":"...reel_final.mp4"}
```

### Per-artifact endpoints

While the job runs, fetch any artifact that's already on disk:

| Endpoint | Returns |
|---|---|
| `GET /jobs/{id}` | full status meta with latest event |
| `GET /jobs/{id}/events` | full jsonl event history |
| `GET /jobs/{id}/plan` | director's plan_expanded.json |
| `GET /jobs/{id}/master/{A,B,C,ABC,scene}` | a master keyframe png |
| `GET /jobs/{id}/keyframe/{0..5}` | a per-shot keyframe png |
| `GET /jobs/{id}/clip/{0..5}` | a per-shot mp4 (silent, 5 sec) |
| `GET /jobs/{id}/music` | the 30-second music wav |
| `GET /jobs/{id}/vo/{0..5}` | a per-shot voice-over wav |
| `GET /jobs/{id}/video` | final mixed reel mp4 (404 while running) |

`GET /jobs` returns the most recent 50 jobs. `GET /health` is auth-free for status.

### Python client snippet

```python
import requests, sseclient
API = "https://your-droplet:8000"
H = {"X-API-Token": "secret"}
job = requests.post(f"{API}/jobs", headers=H, json={
    "prompt": "30s reel: a cellist on a Brooklyn fire escape at sunset",
    "use_critic": True,
}).json()
resp = requests.get(f"{API}/jobs/{job['job_id']}/stream", headers=H, stream=True)
for ev in sseclient.SSEClient(resp).events():
    print(ev.data)
```

### Multi-GPU routing

Each pipeline stage can pin to its own device via env vars (defaults to `cuda:0`):

```bash
STUDIOMI_GPU_FLUX=cuda:1 \
STUDIOMI_GPU_WAN=cuda:0 \
STUDIOMI_GPU_ACE=cuda:1 \
STUDIOMI_GPU_TTS=cuda:1 \
uvicorn server:app --host 0.0.0.0 --port 8000
```

On 2x MI300X you can render the next reel's plan on card 1 while card 0 still
animates the current reel. Tested on a single-MI300X rig - 2-card setup is
designed but not yet validated.
"""
# Markdown for the preset table on the "Performance" tab. The blank line after
# the table keeps the closing paragraph from being parsed as a table row.
PRESET_TABLE_MD = r"""
### Knob presets (config.py)

| preset | num_frames | fps | hero / b-roll steps | FBCache | critic | est. minutes for 30s reel |
|---|---|---|---|---|---|---|
| **default** | 121 | 24 | 30 / 24 | 0.05 (lossless) | 7/10, 3 attempts | ~50-65 |
| **cinematic** | 121 | 24 | 30 / 24 | 0.05 | 7/10, 3 attempts | ~50-65 |
| **fast** | 97 | 24 | 20 / 18 | 0.08 | 6/10, 2 attempts | ~32-40 |
| **draft** | 81 | 24 | 14 / 14 | 0.10 | 5/10, 1 attempt | ~22-28 |

`STUDIOMI_AITER_FP8=1` is a separate env switch; documented but disabled by
default until ROCm/aiter#2187 closes for the multi-shape Wan2.2 case.
"""
# Markdown appended to the "Vision Critic" tab: three sample critic verdicts
# as JSON code fences, followed by a short commentary on the best and worst one.
REAL_VERDICTS_MD = r"""
### Real verdicts pulled from the run logs
These are actual JSON returns from Qwen3.5-35B critiquing real Wan2.2 clips
on this pipeline. The labels feed back into the planner's retry strategy.
```json
{ "shot": 0, "attempt": 1, "score": {
"character_match": 9, "scene_match": 8, "composition": 9, "artifact_free": 7,
"issues": ["STYLIZED_AI_LOOK: skin texture appears slightly plastic/smooth in close-up frames 1-2",
"OBJECT_MORPHING: background bridge structure shifts from Golden Gate to a generic suspension bridge mid-clip"],
"overall": 8 }}
```
```json
{ "shot": 2, "attempt": 1, "score": {
"character_match": 10, "scene_match": 10, "composition": 10, "artifact_free": 9,
"issues": [],
"overall": 10 }}
```
```json
{ "shot": 3, "attempt": 2, "score": {
"character_match": 4, "scene_match": 3, "composition": 2, "artifact_free": 5,
"issues": ["CHARACTER_DRIFT: Subject identity changes completely in final frame from long-haired woman in trench coat to bob cut and turtleneck",
"SCENE_MISMATCH: Golden Gate Bridge vanishes in Frame 3, replaced by generic city street",
"CAMERA_IGNORED: Prompt requested 'static camera' but subject rotates 180 degrees and camera zooms",
"STYLIZED_AI_LOOK: Frame 4 plastic skin texture and oversaturated bokeh"],
"overall": 3 }}
```
The 10/10 was the awning two-shot of Kenji + Mei in v22 - identity locked,
no extras, lighting matches, no `STYLIZED_AI_LOOK` even at this resolution.
The 3/10 was the Golden Gate Bridge overlook - Wan2.2 can't reliably render
that landmark, drifts to generic suspension bridges. After 3 attempts the
pipeline ships the best one and logs the issues.
"""
# Markdown body of the "Stack & GPU" tab. Blank lines separate every Markdown
# block so that a paragraph following a table is not parsed as a table row.
STACK_AND_GPU_MD = """
## The stack — every model is permissively licensed

Every output is yours to use commercially.

| Stage | Model | Size | License |
|---|---|---|---|
| Planner & Critic | **Qwen3.5-35B-A3B** | 35B params (3B active) | Apache 2.0 |
| Image (keyframes) | **FLUX.2 [klein] 4B** | 4B params | Apache 2.0 |
| Video | **Wan2.2-I2V-A14B** | A14B (dual-expert MoE) | Apache 2.0 |
| Music | **ACE-Step v1** | 3.5B params | Apache 2.0 |
| Voice-over | **Kokoro-82M** | 82M, 9 languages | Apache 2.0 |
| LLM serving | **vLLM** | — | Apache 2.0 |
| Diffusion cache | **ParaAttention FBCache** | — | Apache 2.0 |
| AMD kernels | **AITER** | — | MIT |
| Project code | **StudioMI300** | — | MIT |

## Why a single MI300X

192 GB HBM3 is overkill for any single model in this stack. The point is
**sequential diversity** — the same card runs four very different model
architectures back-to-back in one reel, with no offload to disk in between.

| Phase | VRAM peak | Compute pattern |
|---|---|---|
| 1. Director planning | ~70 GB BF16 | Qwen3.5-35B MoE LLM decode (vLLM + AITER MoE) |
| 2. Character masters | ~8 GB | FLUX.2 [klein] 4B diffusion transformer, 4 steps |
| 3. Wan2.2 animation | ~94 GB BF16 | Dual-expert MoE diffusion, 121 frames |
| 4. Vision critic | ~70 GB BF16 | Qwen3.5-35B re-loaded, vision-conditioned |
| 5. Music | ~12 GB | ACE-Step v1 audio diffusion, 27 steps |
| 6. Voice-over | < 1 GB | Kokoro-82M TTS, fits anywhere |

The ROCm allocator caches ~30 GB on top of any active model. With careful unload
and `torch.cuda.empty_cache()` between stages, all phases fit on the same 192 GB
card. On a 24 GB consumer GPU you'd need 4–5 separate machines wired together
just to host all of this.

That's the project's central constraint and its main flex on AMD's headline GPU.
"""
def build_ui():
    """Assemble the whole Gradio interface and return it without launching.

    The layout is a hero header and stats strip, a tab set (live-demo archive,
    comparison, showcase, pipeline walkthrough, vision critic, performance,
    incidents, story/languages, API docs, stack overview, self-host guide) and
    a footer. Content comes from the module-level constants and ``render_*``
    helpers of this file.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for ``queue()`` and
        ``launch()``.
    """
    with gr.Blocks(
        theme=gr.themes.Base(primary_hue="violet", secondary_hue="pink",
                             neutral_hue="slate"),
        css=CUSTOM_CSS,
        title="StudioMI300",
    ) as demo:
        gr.HTML(HERO_HTML)
        gr.HTML(STATS_HTML)
        with gr.Tabs():
            with gr.Tab("Live demo"):
                gr.Markdown(
                    "## Live demo paused — hackathon ended\n\n"
                    "The AMD x lablab hackathon has wrapped, so the on-demand MI300X "
                    "demo is paused. Every clip in the archive below was generated "
                    "end-to-end on a single AMD Instinct MI300X during the event "
                    "(FLUX.2 [klein] 4B keyframe + Wan2.2-I2V-A14B at 81 frames / 16 fps, "
                    "FBCache 0.08, ~6 minutes per clip).\n\n"
                    "> **Not Sora. Not Runway. Not Veo.** Every frame here was made by "
                    "models you can download, weights you can self-host, and code you "
                    "can fork. No paywall, no waitlist, no usage cap. See the "
                    "**vs Sora & Runway** tab for the full breakdown."
                )
                gr.Markdown("### Generations archive")
                # The archive is fetched once at build time; the button below
                # re-renders it on demand.
                demo_gallery = gr.HTML(
                    value=render_demo_grid(fetch_demos()),
                )
                demo_refresh = gr.Button("Refresh archive", size="sm")
                demo_refresh.click(refresh_gallery, outputs=[demo_gallery])
            with gr.Tab("vs Sora & Runway"):
                gr.Markdown(
                    "## Why this is not another frontier-model clone\n\n"
                    "Sora, Runway Gen-3, Google Veo, Kling, Pika — all closed weights, "
                    "all hosted-only, all paid. They produce beautiful clips, and they "
                    "leave you with **zero leverage**: you can't fork them, can't host "
                    "them on your own GPU, can't see their critic logic, can't sell the "
                    "output under terms you control, can't extend the pipeline for a "
                    "new use case without their permission.\n\n"
                    "StudioMI300 is the opposite stack — built so that the work this "
                    "project produces is **owned by the person who runs it**, not "
                    "rented from a vendor.\n\n"
                    "### Side by side\n\n"
                    "| Dimension | Sora · Runway · Veo · Kling · Pika | **StudioMI300** |\n"
                    "|---|---|---|\n"
                    "| Weights | Closed, vendor-only | **Apache 2.0 / MIT — every model** |\n"
                    "| Output license | Vendor ToS, often non-commercial | **Commercial use, no royalties** |\n"
                    "| Where it runs | Vendor cloud only | **Any MI300X / any ROCm host** |\n"
                    "| Pipeline | Black-box single model | **8 stages, every artifact extractable** |\n"
                    "| Story planning | Hidden inside the model | **Director Agent emits a JSON plan** |\n"
                    "| Quality control | None — render once, hope | **Vision critic with 10 failure labels, auto-retry** |\n"
                    "| Music | Vendor-locked or stock licensing | **ACE-Step v1, open weights, royalty-free** |\n"
                    "| Narration | Not included | **Kokoro-82M, 9 languages, per-shot timing** |\n"
                    "| Cost per 30s reel | $0.50 – $4 per render, per attempt | **One GPU-hour, fully amortizable** |\n"
                    "| Audit & reproducibility | None | **Full plan.json + every keyframe + every clip + critic verdicts saved** |\n"
                    "| Vendor lock-in | Total | **None — fork and ship** |\n\n"
                    "### What the open stack uniquely gives you\n\n"
                    "**1. The Director's plan is inspectable.** Sora returns an mp4. "
                    "StudioMI300 returns the mp4 *plus* the 6-shot plan, the character "
                    "bibles, the music brief, and the per-shot voice-over script — as "
                    "structured JSON. Producers can edit the plan and re-render only "
                    "the shots they changed. Try doing that on Runway.\n\n"
                    "**2. The vision critic is explainable.** Every clip carries the "
                    "critic's verdict — *character drift*, *extras invade frame*, "
                    "*walking backwards*, etc. — with the retry strategy that fixed it. "
                    "Sora gives you a frame; this gives you a paper trail.\n\n"
                    "**3. Identity without LoRA training.** FLUX.2 [klein] reference "
                    "editing pins identity by construction — no per-character training "
                    "step, no dataset prep, no 30-minute fine-tune wait. Sora has no "
                    "concept of a *named* character across shots; here it's first-class.\n\n"
                    "**4. Locale-aware narration.** Director picks the narration "
                    "language to match the setting — Tokyo → Japanese, Paris → French, "
                    "Mumbai → Hindi. Sora narrates in nothing.\n\n"
                    "**5. Sequential single-GPU orchestration.** A 35B-MoE director, a "
                    "4B diffusion model, a 14B I2V model, a 3.5B music model, and a TTS "
                    "share one MI300X by loading sequentially. This is the part that "
                    "*only* works because of 192 GB HBM3 — and the part that frontier "
                    "vendors never have to expose, because their cost structure is "
                    "subsidized by a closed API.\n\n"
                    "### What it deliberately does *not* try to do\n\n"
                    "Frontier models invest billions of training-compute into raw "
                    "photoreal fidelity. StudioMI300 doesn't chase that — it composes "
                    "the best open weights available *right now* into a pipeline that "
                    "delivers the **entire creative artifact** (story, characters, "
                    "shots, music, voice, mix) instead of a single isolated clip. The "
                    "bet: an open, transparent, end-to-end pipeline that ships every "
                    "month with the latest open weights will outpace any closed vendor "
                    "on the dimensions that actually matter to a producer — control, "
                    "auditability, ownership, and cost.\n\n"
                    "Frontier models give you a clip. This gives you a studio."
                )
            with gr.Tab("Showcase"):
                gr.Markdown(
                    "### Pre-rendered reels from the live pipeline\n"
                    "Each reel is an actual `mp4` produced end-to-end by the pipeline on "
                    "the MI300X droplet — one prompt in, finished reel out. No human "
                    "selected or trimmed shots. The vision critic ran on every clip."
                )
                if SHOWCASE_REELS:
                    for reel in SHOWCASE_REELS:
                        with gr.Row():
                            with gr.Column(scale=3):
                                video_path = SHOWCASE_DIR / reel["video"]
                                # A reel whose file is missing still gets its
                                # metadata column; only the player is skipped.
                                if video_path.exists():
                                    gr.Video(
                                        value=str(video_path),
                                        label=reel["title"],
                                        autoplay=False,
                                        loop=True,
                                    )
                            with gr.Column(scale=2):
                                gr.Markdown(f"### {reel['title']}")
                                gr.Markdown(f"**Logline.** {reel['logline']}")
                                gr.Markdown(f"**Prompt.**\n```\n{reel['prompt']}\n```")
                                gr.Markdown(f"**Music.** {reel['music_style']}")
                                gr.Markdown(f"**Voice-over.** {reel['vo_lang']}")
                                gr.Markdown(
                                    f"**Render time.** {reel['render_time_min']} min "
                                    f"on 1× MI300X"
                                )
                else:
                    gr.HTML(SHOWCASE_PLACEHOLDER)
            with gr.Tab("How it works"):
                gr.Markdown(
                    "## The pipeline\n"
                    "Eight stages run **sequentially on one GPU**. Each model loads, "
                    "runs, unloads — making room for the next. No multi-GPU magic, "
                    "no separate inference servers, no LoRA training step."
                )
                gr.HTML(PIPELINE_HTML)
                gr.Markdown(
                    "### Why **research-driven** prompts?\n\n"
                    "The Director's planner and the vision critic system prompts aren't "
                    "folklore. They distill 16 sources (Alibaba's official Wan2.2 system "
                    "prompts, the official prompt rewriter, ComfyUI community guides, "
                    "InstaSD's controlled camera tests, HuggingFace Forums) into hard rules:\n\n"
                    "- **Verbatim Chinese trained negative** from `shared_config.py` — umT5 "
                    "was multilingual-pretrained against those exact tokens; the English "
                    "translation is observably weaker.\n"
                    "- **Positive boundary sentences** instead of *\"EXACTLY N people\"* — "
                    "umT5 doesn't ground numerics; Wan2.2 distorts the crowd trying to "
                    "enforce a count.\n"
                    "- **Lens / film tags** (`Arri Alexa, anamorphic, 35mm film grain`) "
                    "instead of `cinematic` — that word triggers Wan2.2's stylization "
                    "branch and gives the AI look.\n"
                    "- **Sentence-case motion verbs** described as a *process*, not "
                    "ALL-CAPS shouting. The all-caps trick is community folklore with no "
                    "documented support; Alibaba's own examples use lowercase.\n"
                    "- **One camera verb per shot, placed first** — multiple verbs in one "
                    "sentence (\"dolly in tracking tilt up\") cancel each other out.\n\n"
                    "Full research write-up lives in the GitHub repo "
                    "(`research/wan22_prompting.md`)."
                )
            with gr.Tab("Vision Critic"):
                gr.Markdown(
                    "## The self-correcting render loop\n\n"
                    "Most generative video pipelines render once and pray. This one "
                    "re-checks every clip with a 35-billion-parameter vision model, "
                    "scores it on four 1–10 axes, and re-renders if it fails. The same "
                    "Qwen3.5-35B that planned the story now grades it.\n\n"
                    "The critic returns four scores (`character_match`, `scene_match`, "
                    "`composition`, `artifact_free`) plus a list of **structured failure "
                    "labels**. The labels are machine-readable and feed back into the "
                    "planner's retry strategy:"
                )
                gr.HTML(render_label_grid())
                gr.Markdown(
                    "Up to three attempts per shot. After that, the best-scoring "
                    "attempt ships and the issue list goes into the run log. The "
                    "pipeline is self-correcting, not blind."
                )
                gr.Markdown(REAL_VERDICTS_MD)
            with gr.Tab("Performance"):
                gr.Markdown(
                    "## Acceleration on AMD MI300X\n\n"
                    "Cumulative end-to-end speedup: **2.5× lossless** vs unoptimised "
                    "Wan2.2 — 25.9 min → 10.4 min per 720p clip."
                )
                gr.HTML(render_speedup_waterfall())
                gr.HTML(render_vram_chart())
                gr.HTML(render_time_breakdown())
                gr.HTML(render_pass_rate())
                gr.Markdown("### Per-knob multiplier breakdown")
                gr.HTML(render_perf_bars())
                gr.Markdown(PRESET_TABLE_MD)
                gr.Markdown(
                    "### What didn't work (and why)\n"
                    "| Tried | Result | Reason |\n"
                    "|---|---|---|\n"
                    "| MagCache via diffusers 0.38 hooks | dead, calibration empty | dual-transformer step counting confuses `_perform_calibration_step` |\n"
                    "| cache-dit DBCache + TaylorSeer | 22.87 min (slower than baseline) | TaylorSeer adds ~6 min on ROCm; cache-dit's L20 numbers don't reproduce |\n"
                    "| AITER FA3 `set_attention_backend(\"flash\")` | hung 9+ min at step 0 | JIT compile for 81×1280×704 sequence never finishes |\n"
                    "| `guidance_scale_2=1.0` (skip CFG on low-noise) | 10.35 vs 10.36 min | diffusers `WanPipeline` doesn't actually short-circuit at boundary |\n"
                    "| `torch.compile(mode=\"max-autotune\", fullgraph=True)` | crash | Dynamo error on Wan2.2 (diffusers#12728) |\n"
                    "| `to(memory_format=torch.channels_last)` on transformer_2 | RuntimeError | Wan2.2 transformer is rank-5 (B,C,F,H,W); channels_last is rank-4 only |\n"
                    "| AITER FP8 (`gemm_a8w8`, `gemm_a8w8_CK`) | segfault mid-pipeline | AITER#2187 multi-shape crash; standalone shape works on ROCm 7.2, pipeline composition does not |"
                )
            with gr.Tab("Incidents"):
                gr.Markdown(
                    "## Field journal\n\n"
                    "A subset of failures, root causes and fixes from May 6–10, 2026. "
                    "These are the stories that don't show up in commit messages — the "
                    "ones where the Wan2.2 prompt did something genuinely surprising, "
                    "or where a kernel decided to disagree with the docs."
                )
                gr.HTML(render_incidents())
                gr.Markdown(
                    "Full incident log is in `incidents.md` in the GitHub repo."
                )
            with gr.Tab("Story & languages"):
                gr.Markdown(STORY_TAB_MD)
            with gr.Tab("Live API"):
                gr.Markdown(API_TAB_MD)
            with gr.Tab("Stack & GPU"):
                gr.Markdown(STACK_AND_GPU_MD)
            with gr.Tab("Self-host"):
                gr.Markdown(
                    "## Run it on your own MI300X\n\n"
                    "A 30-second reel takes ~45 minutes on one MI300X. That's too long "
                    "for a casual visitor on a public Space, so this Space hosts only "
                    "the showcase. To run the full pipeline yourself:\n\n"
                    "1. Get an AMD MI300X (e.g. AMD Developer Cloud — $100 starting "
                    "credits via the AMD AI Developer Program).\n"
                    "2. Pull the `rocm/vllm-dev` container.\n"
                    "3. Clone the repo and run:\n\n"
                    "```bash\n"
                    "python generate.py \\\n"
                    " --prompt \"a cellist plays in a Brooklyn subway station at midnight\" \\\n"
                    " --out outputs/my_reel \\\n"
                    " --critic\n"
                    "```\n\n"
                    "Walk away for ~45 minutes. The pipeline plans, paints, animates, "
                    "scores music, narrates and mixes — all autonomously. No prompt "
                    "engineering per shot, no model swapping, no manual stitching.\n\n"
                    f"### → [Full code on GitHub]({GITHUB_URL})"
                )
        # Footer sits below the tab set but still inside the Blocks context.
        gr.HTML(
            f'<div class="footer">Built solo for the <b>AMD Developer Hackathon 2026</b>'
            f' on a single AMD Instinct MI300X. Apache 2.0 / MIT all the way down. '
            f'<a href="{GITHUB_URL}">GitHub</a> · '
            f'<code>{HACKATHON_BADGE}</code></div>'
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the app, enable its request queue, then serve.
    demo = build_ui()
    queue_options = {"default_concurrency_limit": 1, "max_size": 8}
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "allowed_paths": [str(DEMO_CACHE)],
    }
    demo.queue(**queue_options).launch(**launch_options)
|