// src/lib/server/serverClipRenderer.ts
import { existsSync } from 'fs';
import { mkdir, readFile, rm, writeFile } from 'fs/promises';
import path from 'path';
import { spawn } from 'child_process';
type ClipFile = {
file?: File;
filePath?: string;
ext: string;
step?: number;
};
export type ServerRenderInput = {
videoClips: ClipFile[];
audioClips: ClipFile[];
};
export type ServerRenderResult = {
bytes: Buffer;
durationSeconds: number;
filename: string;
};
const WORK_DIR = path.join(process.cwd(), '.local-review-data', 'server-renders');
const VIDEO_WIDTH = 1080;
const VIDEO_HEIGHT = 1920;
const VIDEO_FPS = 24;
const MIN_VIDEO_CLIP_SECONDS = 5;
const MAX_VIDEO_CLIP_SECONDS = 7;
const MAX_AUDIO_CLIP_SECONDS = 12;
const FINAL_VIDEO_MAX_SECONDS = 17;
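// Per-step caps (in seconds) on how long a single video clip may run in the final cut.
// Steps without an entry fall back to MAX_VIDEO_CLIP_SECONDS.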
const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
1: 6,
2: 7,
3: 4,
};
type PreparedClip = {
path: string;
step: number;
duration: number;
hasAudio: boolean;
};
type VideoSegment = {
source: PreparedClip;
duration: number;
};
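// Resolves which ffmpeg/ffprobe binary to invoke: an explicit path from
// GRABBY_FFMPEG_PATH / GRABBY_FFPROBE_PATH wins, then (on Windows) static
// binaries unpacked under %TEMP%\grabby-media-tools, otherwise the bare
// command name so the system PATH is used.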
function resolveMediaCommand(command: 'ffmpeg' | 'ffprobe') {
const explicit =
command === 'ffmpeg' ? process.env.GRABBY_FFMPEG_PATH : process.env.GRABBY_FFPROBE_PATH;
if (explicit && existsSync(explicit)) return explicit;
const tempRoot = process.env.TEMP || process.env.TMP;
if (process.platform === 'win32' && tempRoot) {
const tempTools = path.join(tempRoot, 'grabby-media-tools', 'node_modules');
const candidate =
command === 'ffmpeg'
? path.join(tempTools, 'ffmpeg-static', 'ffmpeg.exe')
: path.join(tempTools, 'ffprobe-static', 'bin', 'win32', 'x64', 'ffprobe.exe');
if (existsSync(candidate)) return candidate;
}
return command;
}
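// Normalizes a user-supplied extension to a known video/audio suffix,
// falling back to webm for anything unrecognized.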
function safeExt(ext: string) {
const normalized = ext.toLowerCase().replace(/[^a-z0-9]/g, '');
if (normalized === 'mp4' || normalized === 'mov' || normalized === 'webm') return normalized;
if (normalized === 'm4a' || normalized === 'mp3' || normalized === 'wav') return normalized;
return 'webm';
}
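// Spawns ffmpeg or ffprobe, keeps a rolling window of the last 80 log lines,
// and surfaces the final 10 in the error message when the process exits non-zero.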
async function runCommand(command: string, args: string[], label: string) {
return await new Promise<void>((resolve, reject) => {
const child = spawn(resolveMediaCommand(command === 'ffprobe' ? 'ffprobe' : 'ffmpeg'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
const logs: string[] = [];
const collect = (chunk: Buffer) => {
const text = chunk.toString('utf8');
for (const line of text.split(/\r?\n/)) {
const trimmed = line.trim();
if (trimmed) logs.push(trimmed);
}
if (logs.length > 80) logs.splice(0, logs.length - 80);
};
child.stdout.on('data', collect);
child.stderr.on('data', collect);
child.on('error', (err) => {
reject(new Error(`${label} failed to start: ${err.message}`));
});
child.on('close', (code) => {
if (code === 0) {
resolve();
return;
}
reject(
new Error(
logs.length
? `${label} failed with exit code ${code}: ${logs.slice(-10).join(' | ')}`
: `${label} failed with exit code ${code}`,
),
);
});
});
}
async function writeClip(file: File, filePath: string) {
const bytes = Buffer.from(await file.arrayBuffer());
if (bytes.length <= 0) throw new Error('One of the clips was empty.');
await writeFile(filePath, bytes);
}
async function prepareClipSource(clip: ClipFile, fallbackPath: string) {
if (clip.filePath) return clip.filePath;
if (!clip.file) throw new Error('Clip source is missing.');
await writeClip(clip.file, fallbackPath);
return fallbackPath;
}
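// Reads the container duration (format=duration) via ffprobe; returns 0 when
// probing fails or the value is not a finite number.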
async function probeDuration(filePath: string) {
try {
const args = [
'-v',
'error',
'-show_entries',
'format=duration',
'-of',
'default=noprint_wrappers=1:nokey=1',
filePath,
];
const output = await new Promise<string>((resolve, reject) => {
const child = spawn(resolveMediaCommand('ffprobe'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
let stdout = '';
child.stdout.on('data', (chunk: Buffer) => {
stdout += chunk.toString('utf8');
});
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) resolve(stdout.trim());
else reject(new Error(`ffprobe exited with ${code}`));
});
});
const duration = Number(output);
return Number.isFinite(duration) ? Math.max(0, duration) : 0;
} catch {
return 0;
}
}
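// Returns true when ffprobe reports at least one audio stream in the file;
// treated as "no audio" if probing fails.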
async function probeHasAudio(filePath: string) {
try {
const args = [
'-v',
'error',
'-select_streams',
'a',
'-show_entries',
'stream=index',
'-of',
'csv=p=0',
filePath,
];
const output = await new Promise<string>((resolve, reject) => {
const child = spawn(resolveMediaCommand('ffprobe'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
let stdout = '';
child.stdout.on('data', (chunk: Buffer) => {
stdout += chunk.toString('utf8');
});
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) resolve(stdout.trim());
else reject(new Error(`ffprobe exited with ${code}`));
});
});
return output.length > 0;
} catch {
return false;
}
}
function formatDuration(seconds: number) {
return seconds.toFixed(3).replace(/\.?0+$/, '');
}
function clamp(value: number, min: number, max: number) {
return Math.max(min, Math.min(max, value));
}
function inferClipStep(clip: ClipFile, index: number, fallbackStartStep: number) {
return Number.isInteger(clip.step) && clip.step! > 0 ? clip.step! : fallbackStartStep + index;
}
function clipByStep(clips: PreparedClip[], step: number) {
return clips.find((clip) => clip.step === step) ?? null;
}
function maxSecondsForStep(step: number) {
return STEP_VIDEO_MAX_SECONDS[step] ?? MAX_VIDEO_CLIP_SECONDS;
}
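// Skips segments with no source or shorter than 0.25 s so the concat graph
// never receives effectively empty inputs.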
function addSegment(segments: VideoSegment[], source: PreparedClip | null, duration: number) {
if (!source || duration < 0.25) return;
segments.push({
source,
duration,
});
}
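// Fallback timeline: play the clips in upload order, capping each at its
// per-step limit and stopping once FINAL_VIDEO_MAX_SECONDS is spent.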
function buildLinearSegments(videoClips: PreparedClip[]) {
const segments: VideoSegment[] = [];
let remaining = FINAL_VIDEO_MAX_SECONDS;
for (const source of videoClips) {
if (remaining <= 0.25) break;
const stepMax = maxSecondsForStep(source.step);
const fallbackDuration = Math.min(MIN_VIDEO_CLIP_SECONDS, stepMax);
const sourceDuration =
source.duration > 0 ? Math.min(source.duration, stepMax) : fallbackDuration;
const duration = Math.min(sourceDuration, remaining);
addSegment(segments, source, duration);
remaining -= duration;
}
return segments;
}
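// Voiceover-aware timeline: the close and wide shots cover the "order" narration
// (step 4 audio), the action shot covers the "liked" narration (step 5 audio),
// and an optional reaction shot adds a short tail when the render target is at
// least 10 seconds. Falls back to the linear layout if no usable segments result.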
function buildVoiceAwareSegments({
videoClips,
audioClips,
renderTargetSeconds,
}: {
videoClips: PreparedClip[];
audioClips: PreparedClip[];
renderTargetSeconds: number;
}) {
if (videoClips.length === 0) return [];
const closeShot = clipByStep(videoClips, 1) ?? videoClips[0]!;
const wideShot = clipByStep(videoClips, 2) ?? videoClips[1] ?? closeShot;
const actionShot = clipByStep(videoClips, 3) ?? videoClips[2] ?? wideShot;
const reactionShot =
videoClips.find((clip) => clip.step >= 6) ??
(videoClips.length > 3 ? videoClips[videoClips.length - 1]! : null);
const orderAudio = clipByStep(audioClips, 4) ?? audioClips[0] ?? null;
const likedAudio =
clipByStep(audioClips, 5) ??
audioClips.find((clip) => clip !== orderAudio) ??
null;
const targetSeconds = Math.max(1, renderTargetSeconds);
const reactionSeconds =
reactionShot && targetSeconds >= 10
? Math.min(3, reactionShot.duration > 0 ? reactionShot.duration : 3)
: 0;
const narrativeSeconds = Math.max(1, targetSeconds - reactionSeconds);
const fallbackOrderSeconds = likedAudio ? 7 : narrativeSeconds * 0.42;
const rawOrderSeconds = orderAudio?.duration && orderAudio.duration > 0
? orderAudio.duration
: fallbackOrderSeconds;
const minOrderSeconds = Math.min(2, narrativeSeconds);
const maxOrderSeconds = likedAudio ? Math.max(minOrderSeconds, narrativeSeconds - 2) : narrativeSeconds;
const orderSeconds = clamp(rawOrderSeconds, minOrderSeconds, maxOrderSeconds);
const likedSeconds = Math.max(0, narrativeSeconds - orderSeconds);
const segments: VideoSegment[] = [];
if (closeShot === wideShot || orderSeconds < 2.5) {
addSegment(segments, closeShot, orderSeconds);
} else {
const closeSeconds = clamp(orderSeconds * 0.58, 1.25, orderSeconds - 0.75);
addSegment(segments, closeShot, closeSeconds);
addSegment(segments, wideShot, orderSeconds - closeSeconds);
}
addSegment(segments, actionShot, likedSeconds);
addSegment(segments, reactionShot, reactionSeconds);
return segments.length > 0 ? segments : buildLinearSegments(videoClips);
}
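/**
 * Renders the uploaded clips into a single 1080x1920 MP4 with one shared
 * audio/video length, returning the encoded bytes, rounded duration, and filename.
 *
 * A minimal usage sketch, assuming a hypothetical route handler that has already
 * parsed `File` objects out of a multipart form (the handler and its variables
 * are illustrative, not part of this module):
 *
 * // const result = await renderClipsOnServer({
 * //   videoClips: videoFiles.map((file, i) => ({ file, ext: 'webm', step: i + 1 })),
 * //   audioClips: audioFiles.map((file, i) => ({ file, ext: 'm4a', step: i + 4 })),
 * // });
 * // return new Response(result.bytes, { headers: { 'Content-Type': 'video/mp4' } });
 */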
export async function renderClipsOnServer(input: ServerRenderInput): Promise<ServerRenderResult> {
if (input.videoClips.length === 0) {
throw new Error('No video clips were uploaded.');
}
const runId = Math.random().toString(36).slice(2, 10);
const runDir = path.join(WORK_DIR, runId);
await mkdir(runDir, { recursive: true });
const videoPaths: Array<{ path: string; step: number }> = [];
const audioPaths: Array<{ path: string; step: number }> = [];
const outputPath = path.join(runDir, `matcha-server-${runId}.mp4`);
try {
for (let i = 0; i < input.videoClips.length; i++) {
const clip = input.videoClips[i]!;
const filePath = path.join(runDir, `video-${i}.${safeExt(clip.ext)}`);
videoPaths.push({
path: await prepareClipSource(clip, filePath),
step: inferClipStep(clip, i, 1),
});
}
for (let i = 0; i < input.audioClips.length; i++) {
const clip = input.audioClips[i]!;
const filePath = path.join(runDir, `audio-${i}.${safeExt(clip.ext)}`);
audioPaths.push({
path: await prepareClipSource(clip, filePath),
step: inferClipStep(clip, i, 4),
});
}
const [videoDurations, audioDurations, videoAudioFlags] = await Promise.all([
Promise.all(videoPaths.map((clip) => probeDuration(clip.path))),
Promise.all(audioPaths.map((clip) => probeDuration(clip.path))),
Promise.all(videoPaths.map((clip) => probeHasAudio(clip.path))),
]);
const preparedVideoClips = videoPaths.map((clip, index) => ({
path: clip.path,
step: clip.step,
duration: videoDurations[index] ?? 0,
hasAudio: videoAudioFlags[index] ?? false,
}));
const preparedAudioClips = audioPaths.map((clip, index) => ({
path: clip.path,
step: clip.step,
duration: audioDurations[index] ?? 0,
hasAudio: true,
}));
const hasVoiceover = audioPaths.length > 0;
const audioDurationSeconds = audioDurations.reduce((total, duration) => total + duration, 0);
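    // When narration is present, target the total voiceover duration (capped at
    // FINAL_VIDEO_MAX_SECONDS). If durations could not be probed, assume the
    // full cap for multi-clip narration and MAX_AUDIO_CLIP_SECONDS for a single clip.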
const voiceoverTargetSeconds =
Math.max(
1,
Math.min(
FINAL_VIDEO_MAX_SECONDS,
audioDurationSeconds > 0
? audioDurationSeconds
: preparedAudioClips.length >= 2
? FINAL_VIDEO_MAX_SECONDS
: MAX_AUDIO_CLIP_SECONDS,
),
);
const videoSegments = hasVoiceover
? buildVoiceAwareSegments({
videoClips: preparedVideoClips,
audioClips: preparedAudioClips,
renderTargetSeconds: voiceoverTargetSeconds,
})
: buildLinearSegments(preparedVideoClips);
const renderTargetSeconds = hasVoiceover
? voiceoverTargetSeconds
: Math.max(
1,
Math.min(
FINAL_VIDEO_MAX_SECONDS,
videoSegments.reduce((total, segment) => total + segment.duration, 0),
),
);
if (videoSegments.length === 0) {
      throw new Error('No usable video segments could be built from the uploaded clips.');
}
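    // Each video segment is fed in with -stream_loop -1 and a -t cap so short
    // clips repeat until their slot is filled; audio inputs are capped at
    // MAX_AUDIO_CLIP_SECONDS before the filter graph trims them further.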
const inputArgs = [
...videoSegments.flatMap((segment) => [
'-stream_loop',
'-1',
'-t',
formatDuration(segment.duration),
'-i',
segment.source.path,
]),
...audioPaths.flatMap((clip) => [
'-t',
String(MAX_AUDIO_CLIP_SECONDS),
'-i',
clip.path,
]),
];
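    // Scale and pad every segment into the 1080x1920 portrait frame, normalize
    // the frame rate and pixel format, and label each result [v0], [v1], ...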
const videoFilters = videoSegments
.map((segment, i) => {
return (
`[${i}:v]trim=duration=${formatDuration(segment.duration)},setpts=PTS-STARTPTS,` +
`scale=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:force_original_aspect_ratio=decrease,` +
`pad=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:(ow-iw)/2:(oh-ih)/2,` +
`setsar=1,fps=${VIDEO_FPS},format=yuv420p[v${i}]`
);
})
.join(';');
const videoInputs = videoSegments.map((_, i) => `[v${i}]`).join('');
const audioOffset = videoSegments.length;
let filterComplex: string;
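    // With a voiceover, the video segments are concatenated without their embedded
    // audio and the uploaded voice clips become the soundtrack. Without one, each
    // segment contributes its own embedded audio (or generated silence) so audio
    // and video stay aligned through the concat.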
if (hasVoiceover) {
const videoConcat = `${videoInputs}concat=n=${videoSegments.length}:v=1:a=0[vcat]`;
const videoFinalize = `[vcat]trim=duration=${formatDuration(
renderTargetSeconds,
)},setpts=PTS-STARTPTS[v]`;
const audioFilters = audioPaths
.map((_, i) => {
return (
`[${audioOffset + i}:a]atrim=duration=${MAX_AUDIO_CLIP_SECONDS},` +
`aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
`asetpts=PTS-STARTPTS[a${i}]`
);
})
.join(';');
const audioInputs = audioPaths.map((_, i) => `[a${i}]`).join('');
const audioConcat = `${audioInputs}concat=n=${audioPaths.length}:v=0:a=1[acat]`;
const audioFinalize =
`[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
`apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
filterComplex = [
videoFilters,
videoConcat,
videoFinalize,
audioFilters,
audioConcat,
audioFinalize,
]
.filter(Boolean)
.join(';');
} else {
const embeddedAudioFilters = videoSegments
.map((segment, i) => {
if (segment.source.hasAudio) {
return (
`[${i}:a]atrim=duration=${formatDuration(segment.duration)},` +
`aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
`asetpts=PTS-STARTPTS[a${i}]`
);
}
return (
`anullsrc=r=48000:cl=mono,atrim=duration=${formatDuration(segment.duration)},` +
`aformat=sample_rates=48000:channel_layouts=mono,asetpts=PTS-STARTPTS[a${i}]`
);
})
.join(';');
const avInputs = videoSegments.map((_, i) => `[v${i}][a${i}]`).join('');
const avConcat = `${avInputs}concat=n=${videoSegments.length}:v=1:a=1[vcat][acat]`;
const videoFinalize = `[vcat]trim=duration=${formatDuration(
renderTargetSeconds,
)},setpts=PTS-STARTPTS[v]`;
const audioFinalize =
`[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
`apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
filterComplex = [
videoFilters,
embeddedAudioFilters,
avConcat,
videoFinalize,
audioFinalize,
]
.filter(Boolean)
.join(';');
}
// V3 records picture and speech together. Older voiceover renders still use
// separate audio clips, but both paths now emit one shared audio/video length.
const outputArgs = ['-map', '[v]', '-map', '[a]'];
await runCommand(
'ffmpeg',
[
'-y',
...inputArgs,
'-filter_complex',
filterComplex,
...outputArgs,
'-c:v',
'libx264',
'-preset',
'ultrafast',
'-tune',
'zerolatency',
'-crf',
'30',
'-r',
String(VIDEO_FPS),
'-c:a',
'aac',
'-b:a',
'96k',
'-t',
formatDuration(renderTargetSeconds),
'-movflags',
'+faststart',
'-avoid_negative_ts',
'make_zero',
outputPath,
],
'Server clip render',
);
const bytes = await readFile(outputPath);
if (bytes.length <= 0) throw new Error('Server render produced an empty video.');
return {
bytes,
durationSeconds: Math.round(await probeDuration(outputPath)),
filename: `matcha-server-${runId}.mp4`,
};
} finally {
await rm(runDir, { recursive: true, force: true }).catch(() => undefined);
}
}