moonlantern1 committed on
Commit
089c9df
·
1 Parent(s): bf00623

Make classic flow audio first

Browse files
README.md CHANGED
@@ -10,8 +10,8 @@ pinned: false
10
 
11
  # Matcha Moments — frontend
12
 
13
- Cafe-aesthetic, mobile-first PWA that walks a customer through a 5-clip guided
14
- video review and, on submit, hands them a matcha redemption code.
15
 
16
  This repo is a **standalone Next.js app**. It calls Humeo's deployed public
17
  review APIs (`https://humeo.app/api/public/reviews/*`) — no backend changes
@@ -25,7 +25,7 @@ It's gitignored, so it never ships with this repo.
25
  ## Tech stack
26
 
27
  - **Next.js 14** (App Router) + TypeScript + Tailwind CSS
28
- - **`@ffmpeg/ffmpeg`** (ffmpeg.wasm) — client-side concatenation of the 5 recorded clips into one video before upload
29
  - **`getUserMedia` + `MediaRecorder`** — standard browser camera APIs (no native install required)
30
  - **`zod`** — shared validation schemas, mirrors the ones in Humeo's `src/lib/reviews/types.ts`
31
 
@@ -39,8 +39,8 @@ flow opens in 2 seconds, no install, works on iOS Safari and Android Chrome.
39
 
40
  1. `/` — QR landing context screen (dev-only; in prod, the cafe's QR deep-links straight to `/c/[slug]`)
41
  2. `/c/[slug]` — Cafe landing: brand, big "Free matcha, on the house" headline, consent copy, primary CTA
42
- 3. `/c/[slug]/record` — Guided 5-clip recorder (video preview → prompt card → record button → auto-advance)
43
- 4. `/preview` — ffmpeg.wasm stitches the clips, uploads to Humeo, polls submission status, shows the rendered preview
44
  5. `/reward` — Confetti, reward code in a dark card, "show this screen to your server"
45
 
46
  The `[slug]` route is a real Next.js dynamic segment that fetches its campaign
@@ -80,7 +80,7 @@ matcha-moments PWA Humeo backend (deployed)
80
  GET /c/[slug] ──────► GET /api/public/reviews/campaign/[slug]
81
  ◄────── { id, slug, restaurantName, rulesConfig, ... }
82
 
83
- stitch clips locally (ffmpeg.wasm)
84
 
85
  POST /preview submit ──────► POST /api/public/reviews/submit
86
  FormData: video, slug, consentAccepted,
@@ -107,21 +107,18 @@ Until then the matcha-moments app silently injects the cafe defaults.
107
 
108
  ---
109
 
110
- ## Why client-side ffmpeg.wasm?
111
 
112
- Humeo's `/api/public/reviews/submit` accepts a single video file. We want a
113
- multi-clip guided UX without forking Humeo's submit flow. Stitching the 5
114
- recordings in the browser solves that with zero backend changes.
 
115
 
116
- Trade-offs:
117
- - 8MB WASM download, lazy-loaded only after the customer finishes recording
118
- - 3-6 seconds of stitch time on a modern phone for ~50s of total video
119
- - `next.config.js` sets COOP/COEP headers (required for `SharedArrayBuffer`)
120
-
121
- If cafe staff start hearing complaints about phone heat, swap to a
122
- multi-clip upload + server-side ffmpeg endpoint. Humeo's worker
123
- (`reference/src/lib/server/processInterview.ts`) already uses ffmpeg, so the
124
- migration is mostly a new submit endpoint.
125
 
126
  ---
127
 
@@ -135,9 +132,9 @@ src/
135
  c/[slug]/
136
  page.tsx Server component — fetches campaign
137
  LandingClient.tsx Cafe landing screen
138
- record/
139
- page.tsx Server component — fetches campaign
140
- GuidedRecordingClient.tsx 5-clip guided recorder
141
  preview/page.tsx Stitch + upload + preview
142
  reward/page.tsx Reward code reveal
143
  globals.css
 
10
 
11
  # Matcha Moments — frontend
12
 
13
+ Cafe-aesthetic, mobile-first PWA that walks a customer through one voice note
14
+ and three guided food shots, then hands them a matcha redemption code.
15
 
16
  This repo is a **standalone Next.js app**. It calls Humeo's deployed public
17
  review APIs (`https://humeo.app/api/public/reviews/*`) — no backend changes
 
25
  ## Tech stack
26
 
27
  - **Next.js 14** (App Router) + TypeScript + Tailwind CSS
28
+ - **Server-side ffmpeg** — renders the recorded voice note with the food shots before upload
29
  - **`getUserMedia` + `MediaRecorder`** — standard browser camera APIs (no native install required)
30
  - **`zod`** — shared validation schemas, mirrors the ones in Humeo's `src/lib/reviews/types.ts`
31
 
 
39
 
40
  1. `/` — QR landing context screen (dev-only; in prod, the cafe's QR deep-links straight to `/c/[slug]`)
41
  2. `/c/[slug]` — Cafe landing: brand, big "Free matcha, on the house" headline, consent copy, primary CTA
42
+ 3. `/c/[slug]/record` — Audio-first guided recorder (voice note → three food shots → auto-advance)
43
+ 4. `/preview` — server render fits the food shots to the full voice note, uploads to Humeo, polls submission status, shows the rendered preview
44
  5. `/reward` — Confetti, reward code in a dark card, "show this screen to your server"
45
 
46
  The `[slug]` route is a real Next.js dynamic segment that fetches its campaign
 
80
  GET /c/[slug] ──────► GET /api/public/reviews/campaign/[slug]
81
  ◄────── { id, slug, restaurantName, rulesConfig, ... }
82
 
83
+ render clips on the server
84
 
85
  POST /preview submit ──────► POST /api/public/reviews/submit
86
  FormData: video, slug, consentAccepted,
 
107
 
108
  ---
109
 
110
+ ## Why server-side rendering?
111
 
112
+ Humeo's `/api/public/reviews/submit` accepts a single video file. We want a
113
+ guided multi-clip UX where the customer's voice note controls the final edit
114
+ length. Server-side rendering lets us keep the full voice note and fit the food
115
+ shots around it before submitting one finished video.
116
 
117
+ Trade-offs:
118
+ - Clips upload before preview, so the preview depends on server availability.
119
+ - Server render cost replaces phone heat and browser-specific ffmpeg issues.
120
+ - The voice note is the master duration; video is trimmed, reused, or lightly
121
+ held to keep every recorded word.
 
 
 
 
122
 
123
  ---
124
 
 
132
  c/[slug]/
133
  page.tsx Server component — fetches campaign
134
  LandingClient.tsx Cafe landing screen
135
+ record/
136
+ page.tsx Server component — fetches campaign
137
+ GuidedRecordingClient.tsx Audio-first guided recorder
138
  preview/page.tsx Stitch + upload + preview
139
  reward/page.tsx Reward code reveal
140
  globals.css
src/app/api/public/reviews/submit-clips/route.ts CHANGED
@@ -43,8 +43,8 @@ export async function POST(req: NextRequest) {
43
  const deviceKey = sanitizeText(form.get('deviceKey'), 200) || null;
44
  const tableId = sanitizeText(form.get('tableId'), 80) || null;
45
 
46
- const videoClips = collectFiles(form, 'videoClip', 1);
47
- const audioClips = collectFiles(form, 'audioClip', 5);
48
  const totalBytes = [...videoClips, ...audioClips].reduce(
49
  (total, clip) => total + clip.file.size,
50
  0,
 
43
  const deviceKey = sanitizeText(form.get('deviceKey'), 200) || null;
44
  const tableId = sanitizeText(form.get('tableId'), 80) || null;
45
 
46
+ const videoClips = collectFiles(form, 'videoClip', 2);
47
+ const audioClips = collectFiles(form, 'audioClip', 1);
48
  const totalBytes = [...videoClips, ...audioClips].reduce(
49
  (total, clip) => total + clip.file.size,
50
  0,
src/app/c/[slug]/LandingClient.tsx CHANGED
@@ -55,8 +55,8 @@ export function LandingClient({ slug, tableId, campaign }: Props) {
55
  <em className="text-[#4A5C32]">on the house</em>
56
  </h1>
57
 
58
- <p className="mx-auto mt-5 max-w-[280px] font-serif text-[14.5px] leading-[1.55] text-[#5A6E3F]">
59
- You&apos;ll record 3 short video shots and short voice notes. Videos are 2-4 seconds; voice can be up to 8 seconds.
60
  </p>
61
 
62
  <label className="mx-auto mt-6 flex w-full max-w-[324px] cursor-pointer items-start gap-3 rounded-[14px] border border-[#78694B]/20 bg-[#F5EDD9] px-4 py-3.5 text-left shadow-[inset_0_1px_0_rgba(255,255,255,0.6)]">
@@ -82,7 +82,7 @@ export function LandingClient({ slug, tableId, campaign }: Props) {
82
  : 'bg-[#A4AC8C] font-serif text-[17px] text-[#F5EDD9] disabled:cursor-not-allowed disabled:opacity-95'
83
  }
84
  >
85
- Get my matcha →
86
  </Button>
87
  {tableId ? (
88
  <p className="mt-3 text-center font-mono text-[10px] uppercase tracking-[0.15em] text-[#9A8E73]">
 
55
  <em className="text-[#4A5C32]">on the house</em>
56
  </h1>
57
 
58
+ <p className="mx-auto mt-5 max-w-[294px] font-serif text-[14.5px] leading-[1.55] text-[#5A6E3F]">
59
+ Say one quick voice note first. Then record 3 food shots, up to 10 seconds each. We&apos;ll fit the reel to your voice.
60
  </p>
61
 
62
  <label className="mx-auto mt-6 flex w-full max-w-[324px] cursor-pointer items-start gap-3 rounded-[14px] border border-[#78694B]/20 bg-[#F5EDD9] px-4 py-3.5 text-left shadow-[inset_0_1px_0_rgba(255,255,255,0.6)]">
 
82
  : 'bg-[#A4AC8C] font-serif text-[17px] text-[#F5EDD9] disabled:cursor-not-allowed disabled:opacity-95'
83
  }
84
  >
85
+ Get my matcha -&gt;
86
  </Button>
87
  {tableId ? (
88
  <p className="mt-3 text-center font-mono text-[10px] uppercase tracking-[0.15em] text-[#9A8E73]">
src/app/c/[slug]/record/GuidedRecordingClient.tsx CHANGED
@@ -389,7 +389,7 @@ function NativeVideoCapture({
389
  <Camera className="h-14 w-14 text-sage" />
390
  </button>
391
  <p className="mt-5 max-w-[280px] text-sm leading-6 text-white/70">
392
- Use your phone camera for the cleanest clip. Aim for 2-4 seconds.
393
  </p>
394
  {error ? (
395
  <div className="mt-5 rounded-2xl bg-red-500/90 px-4 py-2 text-sm">
 
389
  <Camera className="h-14 w-14 text-sage" />
390
  </button>
391
  <p className="mt-5 max-w-[280px] text-sm leading-6 text-white/70">
392
+ Use your phone camera for the cleanest clip. Record up to {prompt.maxSeconds} seconds.
393
  </p>
394
  {error ? (
395
  <div className="mt-5 rounded-2xl bg-red-500/90 px-4 py-2 text-sm">
src/lib/ffmpeg.ts CHANGED
@@ -2,7 +2,7 @@
2
  * Client-side video concatenation using ffmpeg.wasm.
3
  *
4
  * Why client-side: Humeo's existing /api/public/reviews/submit endpoint accepts
5
- * a single video file. To avoid backend changes for v1, we stitch the 5 clips
6
  * in the browser before upload.
7
  *
8
  * Cost: ~8MB of WASM lazy-loaded after the customer finishes recording.
 
2
  * Client-side video concatenation using ffmpeg.wasm.
3
  *
4
  * Why client-side: Humeo's existing /api/public/reviews/submit endpoint accepts
5
+ * a single video file. To avoid backend changes for v1, this fallback stitches clips
6
  * in the browser before upload.
7
  *
8
  * Cost: ~8MB of WASM lazy-loaded after the customer finishes recording.
src/lib/server/reviewStore.ts CHANGED
@@ -94,67 +94,40 @@ const SAGE_AND_STONE: PublicReviewCampaign = {
94
  prompts: [
95
  {
96
  step: 1,
97
- title: 'Close-up pan of the dish.',
98
- tip: 'Move slowly across the texture, sauce, steam, and toppings.',
99
- mediaType: 'video',
100
- camera: 'rear',
101
- maxSeconds: 4,
102
  optional: false,
103
  },
104
  {
105
  step: 2,
106
- title: 'Wide shot of the meal.',
107
- tip: 'Show the dish on the table.',
108
  mediaType: 'video',
109
  camera: 'rear',
110
- maxSeconds: 4,
111
  optional: false,
112
  },
113
  {
114
  step: 3,
115
- title: 'Action detail of the food.',
116
- tip: 'Scoop, pour, lift, or stir the food.',
117
  mediaType: 'video',
118
  camera: 'rear',
119
- maxSeconds: 4,
120
  optional: false,
121
  },
122
  {
123
  step: 4,
124
- title: 'Reaction shot.',
125
- tip: 'Optional: Take one bite or sip and react naturally. You may skip this step.',
126
  mediaType: 'video',
127
- camera: 'front',
128
- maxSeconds: 4,
129
- optional: true,
130
- },
131
- {
132
- step: 5,
133
- title: "Voice: What's the dish?",
134
- tip: 'Say the dish name. Describe what is on the plate.',
135
- mediaType: 'audio',
136
- camera: 'front',
137
- maxSeconds: 8,
138
- optional: false,
139
- },
140
- {
141
- step: 6,
142
- title: 'What did you like about it?',
143
- tip: 'Say: Flavor, texture, portion, or what you liked.',
144
- mediaType: 'audio',
145
- camera: 'front',
146
- maxSeconds: 8,
147
  optional: false,
148
  },
149
- {
150
- step: 7,
151
- title: 'Voice: Recommendation',
152
- tip: 'Optional: Say if you will recommend the dish and to whom.',
153
- mediaType: 'audio',
154
- camera: 'front',
155
- maxSeconds: 8,
156
- optional: true,
157
- },
158
  ],
159
  rewardType: 'static_code',
160
  rewardValue: null,
 
94
  prompts: [
95
  {
96
  step: 1,
97
+ title: 'Tell us what you ordered.',
98
+ tip: 'Voice only. Say the dish name and what you liked about it.',
99
+ mediaType: 'audio',
100
+ camera: 'front',
101
+ maxSeconds: 10,
102
  optional: false,
103
  },
104
  {
105
  step: 2,
106
+ title: 'Close-up pan of the food.',
107
+ tip: 'Move slowly across the texture, sauce, steam, and toppings.',
108
  mediaType: 'video',
109
  camera: 'rear',
110
+ maxSeconds: 10,
111
  optional: false,
112
  },
113
  {
114
  step: 3,
115
+ title: 'Wide shot of the table.',
116
+ tip: 'Show the full plate, drink, table setup, and a little cafe vibe.',
117
  mediaType: 'video',
118
  camera: 'rear',
119
+ maxSeconds: 10,
120
  optional: false,
121
  },
122
  {
123
  step: 4,
124
+ title: 'Action detail of the food.',
125
+ tip: 'Cut, scoop, pour, lift, stir, or show the best bite.',
126
  mediaType: 'video',
127
+ camera: 'rear',
128
+ maxSeconds: 10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  optional: false,
130
  },
 
 
 
 
 
 
 
 
 
131
  ],
132
  rewardType: 'static_code',
133
  rewardValue: null,
src/lib/server/serverClipRenderer.ts CHANGED
@@ -25,15 +25,15 @@ const WORK_DIR = path.join(process.cwd(), '.local-review-data', 'server-renders'
25
  const VIDEO_WIDTH = 1080;
26
  const VIDEO_HEIGHT = 1920;
27
  const VIDEO_FPS = 24;
28
- const MAX_VIDEO_CLIP_SECONDS = 4;
29
- const MAX_AUDIO_CLIP_SECONDS = 8;
30
  const FINAL_VIDEO_MAX_SECONDS = 17;
31
- const VOICEOVER_RENDER_SAFETY_MAX_SECONDS = 30;
32
  const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
33
- 1: 4,
34
- 2: 4,
35
- 3: 4,
36
- 4: 4,
37
  };
38
 
39
  type PreparedClip = {
@@ -238,52 +238,69 @@ async function probeHasAudio(filePath: string) {
238
  }
239
  }
240
 
241
- async function enforceShortestOutput(inputPath: string, outputPath: string) {
242
  const [videoDuration, audioDuration, formatDurationValue] = await Promise.all([
243
  probeStreamDuration(inputPath, 'v:0'),
244
  probeStreamDuration(inputPath, 'a:0'),
245
  probeFormatDuration(inputPath),
246
  ]);
247
 
248
- if (
249
- videoDuration <= 0 ||
250
- ((audioDuration <= videoDuration + 0.08) && formatDurationValue <= videoDuration + 0.08)
251
- ) {
252
  return {
253
  path: inputPath,
254
  durationSeconds: Math.max(videoDuration, formatDurationValue),
255
  };
256
  }
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  await runCommand(
259
  'ffmpeg',
260
  [
261
  '-y',
262
  '-i',
263
  inputPath,
 
 
264
  '-map',
265
- '0:v:0',
266
  '-map',
267
- '0:a:0?',
268
  '-c:v',
269
- 'copy',
 
 
 
 
 
 
270
  '-c:a',
271
- 'aac',
272
- '-b:a',
273
- '96k',
274
  '-t',
275
- formatDuration(videoDuration),
276
- '-shortest',
277
  '-movflags',
278
  '+faststart',
279
  outputPath,
280
  ],
281
- 'Final duration trim',
282
  );
283
 
284
  return {
285
  path: outputPath,
286
- durationSeconds: await probeFormatDuration(outputPath),
287
  };
288
  }
289
 
@@ -407,6 +424,49 @@ function buildLinearSegments(videoClips: PreparedClip[]) {
407
  return segments;
408
  }
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  function addAudioSegment(
411
  segments: AudioSegment[],
412
  source: PreparedClip | null,
@@ -430,112 +490,24 @@ function buildVoiceAwarePlan({
430
  }): VoiceAwarePlan {
431
  if (videoClips.length === 0) return { videoSegments: [], audioSegments: [] };
432
 
433
- const closeShot = clipByStep(videoClips, 1) ?? videoClips[0]!;
434
- const wideShot = clipByStep(videoClips, 2) ?? videoClips[1] ?? closeShot;
435
- const actionShot = clipByStep(videoClips, 3) ?? videoClips[2] ?? wideShot;
436
- const reactionShot =
437
- clipByStep(videoClips, 4) ??
438
- videoClips.find((clip) => clip.step >= 6) ??
439
- (videoClips.length > 3 ? videoClips[videoClips.length - 1]! : null);
440
-
441
- const orderAudio = clipByStep(audioClips, 5) ?? clipByStep(audioClips, 4) ?? audioClips[0] ?? null;
442
- const likedAudio =
443
- clipByStep(audioClips, 6) ??
444
- audioClips.find((clip) => clip !== orderAudio) ??
445
- null;
446
- const recommendationAudio =
447
- clipByStep(audioClips, 7) ??
448
- audioClips.find((clip) => clip !== orderAudio && clip !== likedAudio) ??
449
- null;
450
-
451
  const targetSeconds = clamp(renderTargetSeconds, 1, VOICEOVER_RENDER_SAFETY_MAX_SECONDS);
452
- const narrativeSeconds = Math.max(1, targetSeconds);
453
-
454
- const desiredOrderSeconds = orderAudio
455
- ? clamp(orderAudio.duration || 3, 0.5, narrativeSeconds)
456
- : 0;
457
- const desiredLikedSeconds = likedAudio
458
- ? clamp(likedAudio.duration || 3, 0.5, narrativeSeconds)
459
- : 0;
460
- const desiredRecommendationSeconds = recommendationAudio
461
- ? clamp(recommendationAudio.duration || 3, 0.5, narrativeSeconds)
462
- : 0;
463
- const desiredVoiceSeconds =
464
- desiredOrderSeconds + desiredLikedSeconds + desiredRecommendationSeconds;
465
- const voiceScale =
466
- desiredVoiceSeconds > narrativeSeconds ? narrativeSeconds / desiredVoiceSeconds : 1;
467
- const orderSeconds = desiredOrderSeconds * voiceScale;
468
- const likedSeconds = desiredLikedSeconds * voiceScale;
469
- const recommendationSeconds = desiredRecommendationSeconds * voiceScale;
470
-
471
- const segments: VideoSegment[] = [];
472
  const audioSegments: AudioSegment[] = [];
473
- const budgets = createClipBudgets(videoClips);
474
- const foodSources = [closeShot, wideShot, actionShot, reactionShot];
475
-
476
- let actualOrderSeconds = 0;
477
- if (closeShot === wideShot || orderSeconds < 2.5) {
478
- actualOrderSeconds += addSegmentWithBudget(segments, budgets, closeShot, orderSeconds);
479
- } else {
480
- const closeSeconds = clamp(orderSeconds * 0.58, 1.25, orderSeconds - 0.75);
481
- actualOrderSeconds += addSegmentWithBudget(segments, budgets, closeShot, closeSeconds);
482
- actualOrderSeconds += addSegmentWithBudget(segments, budgets, wideShot, orderSeconds - closeSeconds);
483
  }
484
- actualOrderSeconds += addRotatingSegments(
485
- segments,
486
- [closeShot, wideShot, actionShot, reactionShot],
487
- orderSeconds - actualOrderSeconds,
488
- );
489
- addAudioSegment(audioSegments, orderAudio, orderSeconds);
490
 
491
- let actualLikedSeconds = addSegmentWithBudget(segments, budgets, actionShot, likedSeconds);
492
- actualLikedSeconds += addRotatingSegments(
493
- segments,
494
- [actionShot, closeShot, wideShot, reactionShot],
495
- likedSeconds - actualLikedSeconds,
496
- );
497
- addAudioSegment(audioSegments, likedAudio, likedSeconds);
498
-
499
- let remainingRecommendationSeconds = recommendationSeconds;
500
- let actualRecommendationSeconds = 0;
501
- actualRecommendationSeconds += addSegmentWithBudget(
502
- segments,
503
- budgets,
504
- reactionShot,
505
- remainingRecommendationSeconds,
506
- );
507
- remainingRecommendationSeconds -= actualRecommendationSeconds;
508
- const wideRecommendationSeconds = addSegmentWithBudget(
509
- segments,
510
- budgets,
511
- wideShot,
512
- remainingRecommendationSeconds,
513
- );
514
- actualRecommendationSeconds += wideRecommendationSeconds;
515
- remainingRecommendationSeconds -= wideRecommendationSeconds;
516
- const actionRecommendationSeconds = addSegmentWithBudget(
517
- segments,
518
- budgets,
519
- actionShot,
520
- remainingRecommendationSeconds,
521
- );
522
- actualRecommendationSeconds += actionRecommendationSeconds;
523
- remainingRecommendationSeconds -= actionRecommendationSeconds;
524
- actualRecommendationSeconds += addSegmentWithBudget(
525
- segments,
526
- budgets,
527
- closeShot,
528
- remainingRecommendationSeconds,
529
- );
530
- actualRecommendationSeconds += addRotatingSegments(
531
- segments,
532
- [reactionShot, wideShot, actionShot, closeShot, ...foodSources],
533
- recommendationSeconds - actualRecommendationSeconds,
534
- );
535
- addAudioSegment(audioSegments, recommendationAudio, recommendationSeconds);
536
 
537
  return {
538
- videoSegments: segments.length > 0 ? segments : buildLinearSegments(videoClips),
539
  audioSegments,
540
  };
541
  }
@@ -763,7 +735,6 @@ export async function renderClipsOnServer(input: ServerRenderInput): Promise<Ser
763
  '96k',
764
  '-t',
765
  formatDuration(renderTargetSeconds),
766
- '-shortest',
767
  '-movflags',
768
  '+faststart',
769
  '-avoid_negative_ts',
@@ -773,7 +744,7 @@ export async function renderClipsOnServer(input: ServerRenderInput): Promise<Ser
773
  'Server clip render',
774
  );
775
 
776
- const finalOutput = await enforceShortestOutput(outputPath, fixedOutputPath);
777
  const bytes = await readFile(finalOutput.path);
778
  if (bytes.length <= 0) throw new Error('Server render produced an empty video.');
779
 
 
25
  const VIDEO_WIDTH = 1080;
26
  const VIDEO_HEIGHT = 1920;
27
  const VIDEO_FPS = 24;
28
+ const MAX_VIDEO_CLIP_SECONDS = 10;
29
+ const MAX_AUDIO_CLIP_SECONDS = 10;
30
  const FINAL_VIDEO_MAX_SECONDS = 17;
31
+ const VOICEOVER_RENDER_SAFETY_MAX_SECONDS = 60;
32
  const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
33
+ 1: 10,
34
+ 2: 10,
35
+ 3: 10,
36
+ 4: 10,
37
  };
38
 
39
  type PreparedClip = {
 
238
  }
239
  }
240
 
241
+ async function enforceAudioMasterOutput(inputPath: string, outputPath: string) {
242
  const [videoDuration, audioDuration, formatDurationValue] = await Promise.all([
243
  probeStreamDuration(inputPath, 'v:0'),
244
  probeStreamDuration(inputPath, 'a:0'),
245
  probeFormatDuration(inputPath),
246
  ]);
247
 
248
+ if (audioDuration <= 0 || videoDuration <= 0) {
 
 
 
249
  return {
250
  path: inputPath,
251
  durationSeconds: Math.max(videoDuration, formatDurationValue),
252
  };
253
  }
254
 
255
+ if (videoDuration + 0.08 >= audioDuration && formatDurationValue <= audioDuration + 0.2) {
256
+ return {
257
+ path: inputPath,
258
+ durationSeconds: audioDuration,
259
+ };
260
+ }
261
+
262
+ const videoPadSeconds = Math.max(0, audioDuration - videoDuration);
263
+ const videoFilter =
264
+ videoPadSeconds > 0.08
265
+ ? `[0:v]tpad=stop_mode=clone:stop_duration=${formatDuration(
266
+ videoPadSeconds,
267
+ )},trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`
268
+ : `[0:v]trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`;
269
+
270
  await runCommand(
271
  'ffmpeg',
272
  [
273
  '-y',
274
  '-i',
275
  inputPath,
276
+ '-filter_complex',
277
+ videoFilter,
278
  '-map',
279
+ '[v]',
280
  '-map',
281
+ '0:a:0',
282
  '-c:v',
283
+ 'libx264',
284
+ '-preset',
285
+ 'ultrafast',
286
+ '-crf',
287
+ '30',
288
+ '-r',
289
+ String(VIDEO_FPS),
290
  '-c:a',
291
+ 'copy',
 
 
292
  '-t',
293
+ formatDuration(audioDuration),
 
294
  '-movflags',
295
  '+faststart',
296
  outputPath,
297
  ],
298
+ 'Final audio-master duration fix',
299
  );
300
 
301
  return {
302
  path: outputPath,
303
+ durationSeconds: audioDuration,
304
  };
305
  }
306
 
 
424
  return segments;
425
  }
426
 
427
+ function sortByStep(clips: PreparedClip[]) {
428
+ return [...clips].sort((a, b) => a.step - b.step);
429
+ }
430
+
431
+ function buildBalancedVideoSegments(videoClips: PreparedClip[], targetSeconds: number) {
432
+ const orderedClips = sortByStep(videoClips);
433
+ const segments: VideoSegment[] = [];
434
+ const budgets = createClipBudgets(orderedClips);
435
+ let remaining = targetSeconds;
436
+ let activeSources = orderedClips.filter((clip) => usableClipDuration(clip) >= 0.25);
437
+ let guard = 0;
438
+
439
+ while (remaining >= 0.25 && activeSources.length > 0 && guard < 20) {
440
+ guard += 1;
441
+ const share = remaining / activeSources.length;
442
+ let addedThisRound = 0;
443
+
444
+ for (const source of activeSources) {
445
+ if (remaining < 0.25) break;
446
+
447
+ const remainingBudget = budgets.get(source.path) ?? usableClipDuration(source);
448
+ if (remainingBudget < 0.25) continue;
449
+
450
+ const requestedSeconds = Math.min(share, remainingBudget, remaining);
451
+ const added = addSegmentWithBudget(segments, budgets, source, requestedSeconds);
452
+ addedThisRound += added;
453
+ remaining -= added;
454
+ }
455
+
456
+ if (addedThisRound < 0.25) break;
457
+ activeSources = activeSources.filter((source) => {
458
+ const remainingBudget = budgets.get(source.path) ?? 0;
459
+ return remainingBudget >= 0.25;
460
+ });
461
+ }
462
+
463
+ if (remaining >= 0.25) {
464
+ addRotatingSegments(segments, orderedClips, remaining);
465
+ }
466
+
467
+ return segments;
468
+ }
469
+
470
  function addAudioSegment(
471
  segments: AudioSegment[],
472
  source: PreparedClip | null,
 
490
  }): VoiceAwarePlan {
491
  if (videoClips.length === 0) return { videoSegments: [], audioSegments: [] };
492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  const targetSeconds = clamp(renderTargetSeconds, 1, VOICEOVER_RENDER_SAFETY_MAX_SECONDS);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  const audioSegments: AudioSegment[] = [];
495
+ let plannedAudioSeconds = 0;
496
+
497
+ for (const audioClip of sortByStep(audioClips)) {
498
+ const remainingTarget = Math.max(0, targetSeconds - plannedAudioSeconds);
499
+ const duration =
500
+ audioClip.duration > 0
501
+ ? audioClip.duration
502
+ : Math.min(MAX_AUDIO_CLIP_SECONDS, remainingTarget || MAX_AUDIO_CLIP_SECONDS);
503
+ addAudioSegment(audioSegments, audioClip, duration);
504
+ plannedAudioSeconds += duration;
505
  }
 
 
 
 
 
 
506
 
507
+ const videoSegments = buildBalancedVideoSegments(videoClips, targetSeconds);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
  return {
510
+ videoSegments: videoSegments.length > 0 ? videoSegments : buildLinearSegments(videoClips),
511
  audioSegments,
512
  };
513
  }
 
735
  '96k',
736
  '-t',
737
  formatDuration(renderTargetSeconds),
 
738
  '-movflags',
739
  '+faststart',
740
  '-avoid_negative_ts',
 
744
  'Server clip render',
745
  );
746
 
747
+ const finalOutput = await enforceAudioMasterOutput(outputPath, fixedOutputPath);
748
  const bytes = await readFile(finalOutput.path);
749
  if (bytes.length <= 0) throw new Error('Server render produced an empty video.');
750