Spaces:

moonlantern1
/

Grabby-Voice-Classic

Running

App Files Files Community

moonlantern1 committed 7 days ago

Commit

089c9df

1 Parent(s): bf00623

Make classic flow audio first

Browse files

Files changed (7) hide show

README.md +19 -22
src/app/api/public/reviews/submit-clips/route.ts +2 -2
src/app/c/[slug]/LandingClient.tsx +3 -3
src/app/c/[slug]/record/GuidedRecordingClient.tsx +1 -1
src/lib/ffmpeg.ts +1 -1
src/lib/server/reviewStore.ts +15 -42
src/lib/server/serverClipRenderer.ts +95 -124

README.md CHANGED Viewed

@@ -10,8 +10,8 @@ pinned: false
 # Matcha Moments — frontend
-Cafe-aesthetic, mobile-first PWA that walks a customer through a 5-clip guided
-video review and, on submit, hands them a matcha redemption code.
 This repo is a **standalone Next.js app**. It calls Humeo's deployed public
 review APIs (`https://humeo.app/api/public/reviews/*`) — no backend changes
@@ -25,7 +25,7 @@ It's gitignored, so it never ships with this repo.
 ## Tech stack
 - **Next.js 14** (App Router) + TypeScript + Tailwind CSS
-- **`@ffmpeg/ffmpeg`** (ffmpeg.wasm) — client-side concatenation of the 5 recorded clips into one video before upload
 - **`getUserMedia` + `MediaRecorder`** — standard browser camera APIs (no native install required)
 - **`zod`** — shared validation schemas, mirrors the ones in Humeo's `src/lib/reviews/types.ts`
@@ -39,8 +39,8 @@ flow opens in 2 seconds, no install, works on iOS Safari and Android Chrome.
 1. `/` — QR landing context screen (dev-only; in prod, the cafe's QR deep-links straight to `/c/[slug]`)
 2. `/c/[slug]` — Cafe landing: brand, big "Free matcha, on the house" headline, consent copy, primary CTA
-3. `/c/[slug]/record` — Guided 5-clip recorder (video preview → prompt card → record button → auto-advance)
-4. `/preview` — ffmpeg.wasm stitches the clips, uploads to Humeo, polls submission status, shows the rendered preview
 5. `/reward` — Confetti, reward code in a dark card, "show this screen to your server"
 The `[slug]` route is a real Next.js dynamic segment that fetches its campaign
@@ -80,7 +80,7 @@ matcha-moments PWA                   Humeo backend (deployed)
 GET  /c/[slug]            ──────►    GET  /api/public/reviews/campaign/[slug]
                           ◄──────    { id, slug, restaurantName, rulesConfig, ... }
-stitch clips locally (ffmpeg.wasm)
 POST /preview submit      ──────►    POST /api/public/reviews/submit
                                        FormData: video, slug, consentAccepted,
@@ -107,21 +107,18 @@ Until then the matcha-moments app silently injects the cafe defaults.
 ---
-## Why client-side ffmpeg.wasm?
-Humeo's `/api/public/reviews/submit` accepts a single video file. We want a
-multi-clip guided UX without forking Humeo's submit flow. Stitching the 5
-recordings in the browser solves that with zero backend changes.
-Trade-offs:
-- 8MB WASM download, lazy-loaded only after the customer finishes recording
-- 3-6 seconds of stitch time on a modern phone for ~50s of total video
-- `next.config.js` sets COOP/COEP headers (required for `SharedArrayBuffer`)
-If cafe staff start hearing complaints about phone heat, swap to a
-multi-clip upload + server-side ffmpeg endpoint. Humeo's worker
-(`reference/src/lib/server/processInterview.ts`) already uses ffmpeg, so the
-migration is mostly a new submit endpoint.
 ---
@@ -135,9 +132,9 @@ src/
     c/[slug]/
       page.tsx                    Server component — fetches campaign
       LandingClient.tsx           Cafe landing screen
-      record/
-        page.tsx                  Server component — fetches campaign
-        GuidedRecordingClient.tsx 5-clip guided recorder
     preview/page.tsx              Stitch + upload + preview
     reward/page.tsx               Reward code reveal
     globals.css

 # Matcha Moments — frontend
+Cafe-aesthetic, mobile-first PWA that walks a customer through one voice note
+and three guided food shots, then hands them a matcha redemption code.
 This repo is a **standalone Next.js app**. It calls Humeo's deployed public
 review APIs (`https://humeo.app/api/public/reviews/*`) — no backend changes
 ## Tech stack
 - **Next.js 14** (App Router) + TypeScript + Tailwind CSS
+- **Server-side ffmpeg** — renders the recorded voice note with the food shots before upload
 - **`getUserMedia` + `MediaRecorder`** — standard browser camera APIs (no native install required)
 - **`zod`** — shared validation schemas, mirrors the ones in Humeo's `src/lib/reviews/types.ts`
 1. `/` — QR landing context screen (dev-only; in prod, the cafe's QR deep-links straight to `/c/[slug]`)
 2. `/c/[slug]` — Cafe landing: brand, big "Free matcha, on the house" headline, consent copy, primary CTA
+3. `/c/[slug]/record` — Audio-first guided recorder (voice note → three food shots → auto-advance)
+4. `/preview` — server render fits the food shots to the full voice note, uploads to Humeo, polls submission status, shows the rendered preview
 5. `/reward` — Confetti, reward code in a dark card, "show this screen to your server"
 The `[slug]` route is a real Next.js dynamic segment that fetches its campaign
 GET  /c/[slug]            ──────►    GET  /api/public/reviews/campaign/[slug]
                           ◄──────    { id, slug, restaurantName, rulesConfig, ... }
+render clips on the server
 POST /preview submit      ──────►    POST /api/public/reviews/submit
                                        FormData: video, slug, consentAccepted,
 ---
+## Why server-side rendering?
+Humeo's `/api/public/reviews/submit` accepts a single video file. We want a
+guided multi-clip UX where the customer's voice note controls the final edit
+length. Server-side rendering lets us keep the full voice note and fit the food
+shots around it before submitting one finished video.
+Trade-offs:
+- Clips upload before preview, so the preview depends on server availability.
+- Server render cost replaces phone heat and browser-specific ffmpeg issues.
+- The voice note is the master duration; video is trimmed, reused, or lightly
+  held to keep every recorded word.
 ---
     c/[slug]/
       page.tsx                    Server component — fetches campaign
       LandingClient.tsx           Cafe landing screen
+      record/
+        page.tsx                  Server component — fetches campaign
+        GuidedRecordingClient.tsx Audio-first guided recorder
     preview/page.tsx              Server render + upload + preview
     reward/page.tsx               Reward code reveal
     globals.css

src/app/api/public/reviews/submit-clips/route.ts CHANGED Viewed

@@ -43,8 +43,8 @@ export async function POST(req: NextRequest) {
     const deviceKey = sanitizeText(form.get('deviceKey'), 200) || null;
     const tableId = sanitizeText(form.get('tableId'), 80) || null;
-    const videoClips = collectFiles(form, 'videoClip', 1);
-    const audioClips = collectFiles(form, 'audioClip', 5);
     const totalBytes = [...videoClips, ...audioClips].reduce(
       (total, clip) => total + clip.file.size,
       0,

     const deviceKey = sanitizeText(form.get('deviceKey'), 200) || null;
     const tableId = sanitizeText(form.get('tableId'), 80) || null;
+    const videoClips = collectFiles(form, 'videoClip', 2);
+    const audioClips = collectFiles(form, 'audioClip', 1);
     const totalBytes = [...videoClips, ...audioClips].reduce(
       (total, clip) => total + clip.file.size,
       0,

src/app/c/[slug]/LandingClient.tsx CHANGED Viewed

@@ -55,8 +55,8 @@ export function LandingClient({ slug, tableId, campaign }: Props) {
           <em className="text-[#4A5C32]">on the house</em>
         </h1>
-        <p className="mx-auto mt-5 max-w-[280px] font-serif text-[14.5px] leading-[1.55] text-[#5A6E3F]">
-          You&apos;ll record 3 short video shots and short voice notes. Videos are 2-4 seconds; voice can be up to 8 seconds.
         </p>
         <label className="mx-auto mt-6 flex w-full max-w-[324px] cursor-pointer items-start gap-3 rounded-[14px] border border-[#78694B]/20 bg-[#F5EDD9] px-4 py-3.5 text-left shadow-[inset_0_1px_0_rgba(255,255,255,0.6)]">
@@ -82,7 +82,7 @@ export function LandingClient({ slug, tableId, campaign }: Props) {
               : 'bg-[#A4AC8C] font-serif text-[17px] text-[#F5EDD9] disabled:cursor-not-allowed disabled:opacity-95'
           }
         >
-          Get my matcha →
         </Button>
         {tableId ? (
           <p className="mt-3 text-center font-mono text-[10px] uppercase tracking-[0.15em] text-[#9A8E73]">

           <em className="text-[#4A5C32]">on the house</em>
         </h1>
+        <p className="mx-auto mt-5 max-w-[294px] font-serif text-[14.5px] leading-[1.55] text-[#5A6E3F]">
+          Say one quick voice note first. Then record 3 food shots, up to 10 seconds each. We&apos;ll fit the reel to your voice.
         </p>
         <label className="mx-auto mt-6 flex w-full max-w-[324px] cursor-pointer items-start gap-3 rounded-[14px] border border-[#78694B]/20 bg-[#F5EDD9] px-4 py-3.5 text-left shadow-[inset_0_1px_0_rgba(255,255,255,0.6)]">
               : 'bg-[#A4AC8C] font-serif text-[17px] text-[#F5EDD9] disabled:cursor-not-allowed disabled:opacity-95'
           }
         >
+          Get my matcha -&gt;
         </Button>
         {tableId ? (
           <p className="mt-3 text-center font-mono text-[10px] uppercase tracking-[0.15em] text-[#9A8E73]">

src/app/c/[slug]/record/GuidedRecordingClient.tsx CHANGED Viewed

@@ -389,7 +389,7 @@ function NativeVideoCapture({
               <Camera className="h-14 w-14 text-sage" />
             </button>
             <p className="mt-5 max-w-[280px] text-sm leading-6 text-white/70">
-              Use your phone camera for the cleanest clip. Aim for 2-4 seconds.
             </p>
             {error ? (
               <div className="mt-5 rounded-2xl bg-red-500/90 px-4 py-2 text-sm">

               <Camera className="h-14 w-14 text-sage" />
             </button>
             <p className="mt-5 max-w-[280px] text-sm leading-6 text-white/70">
+              Use your phone camera for the cleanest clip. Record up to {prompt.maxSeconds} seconds.
             </p>
             {error ? (
               <div className="mt-5 rounded-2xl bg-red-500/90 px-4 py-2 text-sm">

src/lib/ffmpeg.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Client-side video concatenation using ffmpeg.wasm.
  *
  * Why client-side: Humeo's existing /api/public/reviews/submit endpoint accepts
- * a single video file. To avoid backend changes for v1, we stitch the 5 clips
  * in the browser before upload.
  *
  * Cost: ~8MB of WASM lazy-loaded after the customer finishes recording.

  * Client-side video concatenation using ffmpeg.wasm.
  *
  * Why client-side: Humeo's existing /api/public/reviews/submit endpoint accepts
+ * a single video file. To avoid backend changes for v1, this fallback stitches clips
  * in the browser before upload.
  *
  * Cost: ~8MB of WASM lazy-loaded after the customer finishes recording.

src/lib/server/reviewStore.ts CHANGED Viewed

@@ -94,67 +94,40 @@ const SAGE_AND_STONE: PublicReviewCampaign = {
   prompts: [
     {
       step: 1,
-      title: 'Close-up pan of the dish.',
-      tip: 'Move slowly across the texture, sauce, steam, and toppings.',
-      mediaType: 'video',
-      camera: 'rear',
-      maxSeconds: 4,
       optional: false,
     },
     {
       step: 2,
-      title: 'Wide shot of the meal.',
-      tip: 'Show the dish on the table.',
       mediaType: 'video',
       camera: 'rear',
-      maxSeconds: 4,
       optional: false,
     },
     {
       step: 3,
-      title: 'Action detail of the food.',
-      tip: 'Scoop, pour, lift, or stir the food.',
       mediaType: 'video',
       camera: 'rear',
-      maxSeconds: 4,
       optional: false,
     },
     {
       step: 4,
-      title: 'Reaction shot.',
-      tip: 'Optional: Take one bite or sip and react naturally. You may skip this step.',
       mediaType: 'video',
-      camera: 'front',
-      maxSeconds: 4,
-      optional: true,
-    },
-    {
-      step: 5,
-      title: "Voice: What's the dish?",
-      tip: 'Say the dish name. Describe what is on the plate.',
-      mediaType: 'audio',
-      camera: 'front',
-      maxSeconds: 8,
-      optional: false,
-    },
-    {
-      step: 6,
-      title: 'What did you like about it?',
-      tip: 'Say: Flavor, texture, portion, or what you liked.',
-      mediaType: 'audio',
-      camera: 'front',
-      maxSeconds: 8,
       optional: false,
     },
-    {
-      step: 7,
-      title: 'Voice: Recommendation',
-      tip: 'Optional: Say if you will recommend the dish and to whom.',
-      mediaType: 'audio',
-      camera: 'front',
-      maxSeconds: 8,
-      optional: true,
-    },
   ],
   rewardType: 'static_code',
   rewardValue: null,

   prompts: [
     {
       step: 1,
+      title: 'Tell us what you ordered.',
+      tip: 'Voice only. Say the dish name and what you liked about it.',
+      mediaType: 'audio',
+      camera: 'front',
+      maxSeconds: 10,
       optional: false,
     },
     {
       step: 2,
+      title: 'Close-up pan of the food.',
+      tip: 'Move slowly across the texture, sauce, steam, and toppings.',
       mediaType: 'video',
       camera: 'rear',
+      maxSeconds: 10,
       optional: false,
     },
     {
       step: 3,
+      title: 'Wide shot of the table.',
+      tip: 'Show the full plate, drink, table setup, and a little cafe vibe.',
       mediaType: 'video',
       camera: 'rear',
+      maxSeconds: 10,
       optional: false,
     },
     {
       step: 4,
+      title: 'Action detail of the food.',
+      tip: 'Cut, scoop, pour, lift, stir, or show the best bite.',
       mediaType: 'video',
+      camera: 'rear',
+      maxSeconds: 10,
       optional: false,
     },
   ],
   rewardType: 'static_code',
   rewardValue: null,

src/lib/server/serverClipRenderer.ts CHANGED Viewed

@@ -25,15 +25,15 @@ const WORK_DIR = path.join(process.cwd(), '.local-review-data', 'server-renders'
 const VIDEO_WIDTH = 1080;
 const VIDEO_HEIGHT = 1920;
 const VIDEO_FPS = 24;
-const MAX_VIDEO_CLIP_SECONDS = 4;
-const MAX_AUDIO_CLIP_SECONDS = 8;
 const FINAL_VIDEO_MAX_SECONDS = 17;
-const VOICEOVER_RENDER_SAFETY_MAX_SECONDS = 30;
 const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
-  1: 4,
-  2: 4,
-  3: 4,
-  4: 4,
 };
 type PreparedClip = {
@@ -238,52 +238,69 @@ async function probeHasAudio(filePath: string) {
   }
 }
-async function enforceShortestOutput(inputPath: string, outputPath: string) {
   const [videoDuration, audioDuration, formatDurationValue] = await Promise.all([
     probeStreamDuration(inputPath, 'v:0'),
     probeStreamDuration(inputPath, 'a:0'),
     probeFormatDuration(inputPath),
   ]);
-  if (
-    videoDuration <= 0 ||
-    ((audioDuration <= videoDuration + 0.08) && formatDurationValue <= videoDuration + 0.08)
-  ) {
     return {
       path: inputPath,
       durationSeconds: Math.max(videoDuration, formatDurationValue),
     };
   }
   await runCommand(
     'ffmpeg',
     [
       '-y',
       '-i',
       inputPath,
       '-map',
-      '0:v:0',
       '-map',
-      '0:a:0?',
       '-c:v',
-      'copy',
       '-c:a',
-      'aac',
-      '-b:a',
-      '96k',
       '-t',
-      formatDuration(videoDuration),
-      '-shortest',
       '-movflags',
       '+faststart',
       outputPath,
     ],
-    'Final duration trim',
   );
   return {
     path: outputPath,
-    durationSeconds: await probeFormatDuration(outputPath),
   };
 }
@@ -407,6 +424,49 @@ function buildLinearSegments(videoClips: PreparedClip[]) {
   return segments;
 }
 function addAudioSegment(
   segments: AudioSegment[],
   source: PreparedClip | null,
@@ -430,112 +490,24 @@ function buildVoiceAwarePlan({
 }): VoiceAwarePlan {
   if (videoClips.length === 0) return { videoSegments: [], audioSegments: [] };
-  const closeShot = clipByStep(videoClips, 1) ?? videoClips[0]!;
-  const wideShot = clipByStep(videoClips, 2) ?? videoClips[1] ?? closeShot;
-  const actionShot = clipByStep(videoClips, 3) ?? videoClips[2] ?? wideShot;
-  const reactionShot =
-    clipByStep(videoClips, 4) ??
-    videoClips.find((clip) => clip.step >= 6) ??
-    (videoClips.length > 3 ? videoClips[videoClips.length - 1]! : null);
-  const orderAudio = clipByStep(audioClips, 5) ?? clipByStep(audioClips, 4) ?? audioClips[0] ?? null;
-  const likedAudio =
-    clipByStep(audioClips, 6) ??
-    audioClips.find((clip) => clip !== orderAudio) ??
-    null;
-  const recommendationAudio =
-    clipByStep(audioClips, 7) ??
-    audioClips.find((clip) => clip !== orderAudio && clip !== likedAudio) ??
-    null;
   const targetSeconds = clamp(renderTargetSeconds, 1, VOICEOVER_RENDER_SAFETY_MAX_SECONDS);
-  const narrativeSeconds = Math.max(1, targetSeconds);
-  const desiredOrderSeconds = orderAudio
-    ? clamp(orderAudio.duration || 3, 0.5, narrativeSeconds)
-    : 0;
-  const desiredLikedSeconds = likedAudio
-    ? clamp(likedAudio.duration || 3, 0.5, narrativeSeconds)
-    : 0;
-  const desiredRecommendationSeconds = recommendationAudio
-    ? clamp(recommendationAudio.duration || 3, 0.5, narrativeSeconds)
-    : 0;
-  const desiredVoiceSeconds =
-    desiredOrderSeconds + desiredLikedSeconds + desiredRecommendationSeconds;
-  const voiceScale =
-    desiredVoiceSeconds > narrativeSeconds ? narrativeSeconds / desiredVoiceSeconds : 1;
-  const orderSeconds = desiredOrderSeconds * voiceScale;
-  const likedSeconds = desiredLikedSeconds * voiceScale;
-  const recommendationSeconds = desiredRecommendationSeconds * voiceScale;
-  const segments: VideoSegment[] = [];
   const audioSegments: AudioSegment[] = [];
-  const budgets = createClipBudgets(videoClips);
-  const foodSources = [closeShot, wideShot, actionShot, reactionShot];
-  let actualOrderSeconds = 0;
-  if (closeShot === wideShot || orderSeconds < 2.5) {
-    actualOrderSeconds += addSegmentWithBudget(segments, budgets, closeShot, orderSeconds);
-  } else {
-    const closeSeconds = clamp(orderSeconds * 0.58, 1.25, orderSeconds - 0.75);
-    actualOrderSeconds += addSegmentWithBudget(segments, budgets, closeShot, closeSeconds);
-    actualOrderSeconds += addSegmentWithBudget(segments, budgets, wideShot, orderSeconds - closeSeconds);
   }
-  actualOrderSeconds += addRotatingSegments(
-    segments,
-    [closeShot, wideShot, actionShot, reactionShot],
-    orderSeconds - actualOrderSeconds,
-  );
-  addAudioSegment(audioSegments, orderAudio, orderSeconds);
-  let actualLikedSeconds = addSegmentWithBudget(segments, budgets, actionShot, likedSeconds);
-  actualLikedSeconds += addRotatingSegments(
-    segments,
-    [actionShot, closeShot, wideShot, reactionShot],
-    likedSeconds - actualLikedSeconds,
-  );
-  addAudioSegment(audioSegments, likedAudio, likedSeconds);
-  let remainingRecommendationSeconds = recommendationSeconds;
-  let actualRecommendationSeconds = 0;
-  actualRecommendationSeconds += addSegmentWithBudget(
-    segments,
-    budgets,
-    reactionShot,
-    remainingRecommendationSeconds,
-  );
-  remainingRecommendationSeconds -= actualRecommendationSeconds;
-  const wideRecommendationSeconds = addSegmentWithBudget(
-    segments,
-    budgets,
-    wideShot,
-    remainingRecommendationSeconds,
-  );
-  actualRecommendationSeconds += wideRecommendationSeconds;
-  remainingRecommendationSeconds -= wideRecommendationSeconds;
-  const actionRecommendationSeconds = addSegmentWithBudget(
-    segments,
-    budgets,
-    actionShot,
-    remainingRecommendationSeconds,
-  );
-  actualRecommendationSeconds += actionRecommendationSeconds;
-  remainingRecommendationSeconds -= actionRecommendationSeconds;
-  actualRecommendationSeconds += addSegmentWithBudget(
-    segments,
-    budgets,
-    closeShot,
-    remainingRecommendationSeconds,
-  );
-  actualRecommendationSeconds += addRotatingSegments(
-    segments,
-    [reactionShot, wideShot, actionShot, closeShot, ...foodSources],
-    recommendationSeconds - actualRecommendationSeconds,
-  );
-  addAudioSegment(audioSegments, recommendationAudio, recommendationSeconds);
   return {
-    videoSegments: segments.length > 0 ? segments : buildLinearSegments(videoClips),
     audioSegments,
   };
 }
@@ -763,7 +735,6 @@ export async function renderClipsOnServer(input: ServerRenderInput): Promise<Ser
         '96k',
         '-t',
         formatDuration(renderTargetSeconds),
-        '-shortest',
         '-movflags',
         '+faststart',
         '-avoid_negative_ts',
@@ -773,7 +744,7 @@ export async function renderClipsOnServer(input: ServerRenderInput): Promise<Ser
       'Server clip render',
     );
-    const finalOutput = await enforceShortestOutput(outputPath, fixedOutputPath);
     const bytes = await readFile(finalOutput.path);
     if (bytes.length <= 0) throw new Error('Server render produced an empty video.');

 const VIDEO_WIDTH = 1080; // render-size constants for the server renderer; presumably pixels (portrait 1080x1920) — confirm at the use site
 const VIDEO_HEIGHT = 1920;
 const VIDEO_FPS = 24; // passed to ffmpeg as the `-r` value in enforceAudioMasterOutput
+const MAX_VIDEO_CLIP_SECONDS = 10;
+const MAX_AUDIO_CLIP_SECONDS = 10; // also the fallback length buildVoiceAwarePlan uses when an audio clip reports no positive duration
 const FINAL_VIDEO_MAX_SECONDS = 17; // NOTE(review): still 17 while the per-clip caps above are 10 and the voice-over ceiling below is 60 — confirm this cap is still intended
+const VOICEOVER_RENDER_SAFETY_MAX_SECONDS = 60; // upper clamp applied to renderTargetSeconds in buildVoiceAwarePlan
 const STEP_VIDEO_MAX_SECONDS: Record<number, number> = { // presumably keyed by prompt step number — TODO confirm against the lookup site
+  1: 10, // NOTE(review): step 1 is the audio prompt in reviewStore's default campaign; confirm a video cap for step 1 is still needed
+  2: 10,
+  3: 10,
+  4: 10,
 };
 type PreparedClip = {
   }
 }
+async function enforceAudioMasterOutput(inputPath: string, outputPath: string) { // Makes the audio stream the master duration: returns inputPath untouched when it already fits, otherwise re-renders to outputPath with the video padded/trimmed to the audio length.
   const [videoDuration, audioDuration, formatDurationValue] = await Promise.all([
     probeStreamDuration(inputPath, 'v:0'),
     probeStreamDuration(inputPath, 'a:0'),
     probeFormatDuration(inputPath),
   ]);
+  if (audioDuration <= 0 || videoDuration <= 0) { // a non-positive duration for either stream: nothing to align, hand the input back unchanged
     return {
       path: inputPath,
       durationSeconds: Math.max(videoDuration, formatDurationValue),
     };
   }
+  if (videoDuration + 0.08 >= audioDuration && formatDurationValue <= audioDuration + 0.2) { // video already covers the audio (0.08s tolerance) and the container is at most 0.2s longer than the audio: no fix needed
+    return {
+      path: inputPath,
+      durationSeconds: audioDuration,
+    };
+  }
+  const videoPadSeconds = Math.max(0, audioDuration - videoDuration); // how far the video falls short of the audio; 0 when the video is the longer stream
+  const videoFilter =
+    videoPadSeconds > 0.08 // pad only for a shortfall above 0.08s (tpad with stop_mode=clone extends using the last frame); both branches trim to the audio length and reset timestamps
+      ? `[0:v]tpad=stop_mode=clone:stop_duration=${formatDuration(
+          videoPadSeconds,
+        )},trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`
+      : `[0:v]trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`;
   await runCommand(
     'ffmpeg',
     [
       '-y',
       '-i',
       inputPath,
+      '-filter_complex',
+      videoFilter,
       '-map',
+      '[v]', // take the filtered video output, not the raw input stream
       '-map',
+      '0:a:0',
       '-c:v',
+      'libx264', // filtered video has to be encoded again; it cannot be stream-copied
+      '-preset',
+      'ultrafast',
+      '-crf',
+      '30',
+      '-r',
+      String(VIDEO_FPS),
       '-c:a',
+      'copy', // audio is stream-copied, not re-encoded
       '-t',
+      formatDuration(audioDuration), // cap the whole output at the audio length
       '-movflags',
       '+faststart',
       outputPath,
     ],
+    'Final audio-master duration fix',
   );
   return {
     path: outputPath,
+    durationSeconds: audioDuration, // reports the audio duration probed from the input; the output file is not probed again
   };
 }
   return segments;
 }
+function sortByStep(clips: PreparedClip[]) { // Returns a new array ordered by ascending `step`; the spread copy leaves the caller's array untouched.
+  return [...clips].sort((a, b) => a.step - b.step);
+}
+function buildBalancedVideoSegments(videoClips: PreparedClip[], targetSeconds: number) { // Plans video segments toward targetSeconds by sharing the time across the clips in step order, in rounds, and returns the segment list.
+  const orderedClips = sortByStep(videoClips);
+  const segments: VideoSegment[] = [];
+  const budgets = createClipBudgets(orderedClips); // looked up by clip path below; presumably the seconds each clip can still contribute — confirm in createClipBudgets
+  let remaining = targetSeconds;
+  let activeSources = orderedClips.filter((clip) => usableClipDuration(clip) >= 0.25); // clips with under 0.25s of usable footage never take part in the rounds
+  let guard = 0;
+  while (remaining >= 0.25 && activeSources.length > 0 && guard < 20) { // guard caps the loop at 20 rounds
+    guard += 1;
+    const share = remaining / activeSources.length; // equal split of the still-unplanned time across the active clips for this round
+    let addedThisRound = 0;
+    for (const source of activeSources) {
+      if (remaining < 0.25) break;
+      const remainingBudget = budgets.get(source.path) ?? usableClipDuration(source); // NOTE(review): a missing entry falls back to the usable duration here but to 0 in the filter below — confirm the difference is intended
+      if (remainingBudget < 0.25) continue;
+      const requestedSeconds = Math.min(share, remainingBudget, remaining); // never ask for more than the share, the clip's budget, or the time left
+      const added = addSegmentWithBudget(segments, budgets, source, requestedSeconds); // presumably returns the seconds actually appended and updates budgets — verify in addSegmentWithBudget
+      addedThisRound += added;
+      remaining -= added;
+    }
+    if (addedThisRound < 0.25) break; // a round that added less than 0.25s ends the loop
+    activeSources = activeSources.filter((source) => {
+      const remainingBudget = budgets.get(source.path) ?? 0;
+      return remainingBudget >= 0.25;
+    });
+  }
+  if (remaining >= 0.25) { // time still unplanned after the rounds is handed to addRotatingSegments with every clip, including those filtered out above
+    addRotatingSegments(segments, orderedClips, remaining);
+  }
+  return segments;
+}
 function addAudioSegment(
   segments: AudioSegment[],
   source: PreparedClip | null,
 }): VoiceAwarePlan {
   if (videoClips.length === 0) return { videoSegments: [], audioSegments: [] };
   const targetSeconds = clamp(renderTargetSeconds, 1, VOICEOVER_RENDER_SAFETY_MAX_SECONDS);
   const audioSegments: AudioSegment[] = [];
+  let plannedAudioSeconds = 0;
+  for (const audioClip of sortByStep(audioClips)) {
+    const remainingTarget = Math.max(0, targetSeconds - plannedAudioSeconds);
+    const duration =
+      audioClip.duration > 0
+        ? audioClip.duration
+        : Math.min(MAX_AUDIO_CLIP_SECONDS, remainingTarget || MAX_AUDIO_CLIP_SECONDS);
+    addAudioSegment(audioSegments, audioClip, duration);
+    plannedAudioSeconds += duration;
   }
+  const videoSegments = buildBalancedVideoSegments(videoClips, targetSeconds);
   return {
+    videoSegments: videoSegments.length > 0 ? videoSegments : buildLinearSegments(videoClips),
     audioSegments,
   };
 }
         '96k',
         '-t',
         formatDuration(renderTargetSeconds),
         '-movflags',
         '+faststart',
         '-avoid_negative_ts',
       'Server clip render',
     );
+    const finalOutput = await enforceAudioMasterOutput(outputPath, fixedOutputPath);
     const bytes = await readFile(finalOutput.path);
     if (bytes.length <= 0) throw new Error('Server render produced an empty video.');