// Grabby-Voice-Classic — src/lib/server/serverClipRenderer.ts
// "Use recorded durations for clip rendering" (commit 513b862, moonlantern1)
import { existsSync } from 'fs';
import { mkdir, readFile, rm, writeFile } from 'fs/promises';
import path from 'path';
import { spawn } from 'child_process';
// A clip supplied either as an in-memory upload or as a file already on disk.
type ClipFile = {
  // Uploaded payload; used only when `filePath` is absent (see prepareClipSource).
  file?: File;
  // Path to an existing file on disk; takes precedence over `file`.
  filePath?: string;
  // Client-reported file extension; sanitized through safeExt before use.
  ext: string;
  // 1-based recording step this clip belongs to (see inferClipStep).
  step?: number;
  // Duration measured by the client recorder; validated by sanitizeClientDuration.
  durationSeconds?: number;
};
// Input payload for renderClipsOnServer.
export type ServerRenderInput = {
  videoClips: ClipFile[];
  audioClips: ClipFile[];
};
// Result of a successful server render.
export type ServerRenderResult = {
  // Encoded MP4 file contents.
  bytes: Buffer;
  // Final duration, rounded to the nearest whole second in renderClipsOnServer.
  durationSeconds: number;
  filename: string;
};
// Scratch directory for per-run render artifacts; each run's subdirectory is
// deleted in renderClipsOnServer's finally block.
const WORK_DIR = path.join(process.cwd(), '.local-review-data', 'server-renders');
// Output canvas: 1080x1920 portrait at 24 fps.
const VIDEO_WIDTH = 1080;
const VIDEO_HEIGHT = 1920;
const VIDEO_FPS = 24;
// Hard per-clip caps, in seconds.
const MAX_VIDEO_CLIP_SECONDS = 10;
const MAX_AUDIO_CLIP_SECONDS = 10;
// Per-step override for the video cap; steps not listed fall back to
// MAX_VIDEO_CLIP_SECONDS (see maxSecondsForStep).
const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
  1: 10,
  2: 10,
  3: 10,
  4: 10,
  5: 10,
};
// A clip materialized on disk with its resolved duration and audio presence.
type PreparedClip = {
  path: string;
  step: number;
  // Seconds; 0 when the duration could not be determined.
  duration: number;
  // Whether ffprobe found at least one audio stream in the file.
  hasAudio: boolean;
};
// A slice of a prepared clip scheduled into the output video track.
type VideoSegment = {
  source: PreparedClip;
  duration: number;
};
// A slice of a prepared clip scheduled into the output audio track.
type AudioSegment = {
  source: PreparedClip;
  duration: number;
};
// Segment schedule produced when separate voiceover clips are present.
type VoiceAwarePlan = {
  videoSegments: VideoSegment[];
  audioSegments: AudioSegment[];
};
/**
 * Resolves the executable to spawn for ffmpeg/ffprobe.
 *
 * Resolution order: an explicit GRABBY_FFMPEG_PATH / GRABBY_FFPROBE_PATH env
 * override (when the file exists), then — on Windows only — a bundled static
 * binary under the TEMP grabby-media-tools directory, and finally the bare
 * command name so the OS PATH lookup applies.
 */
function resolveMediaCommand(command: 'ffmpeg' | 'ffprobe') {
  const override =
    command === 'ffmpeg' ? process.env.GRABBY_FFMPEG_PATH : process.env.GRABBY_FFPROBE_PATH;
  if (override && existsSync(override)) return override;
  const tempDir = process.env.TEMP || process.env.TMP;
  if (process.platform === 'win32' && tempDir) {
    const toolsRoot = path.join(tempDir, 'grabby-media-tools', 'node_modules');
    const bundled =
      command === 'ffprobe'
        ? path.join(toolsRoot, 'ffprobe-static', 'bin', 'win32', 'x64', 'ffprobe.exe')
        : path.join(toolsRoot, 'ffmpeg-static', 'ffmpeg.exe');
    if (existsSync(bundled)) return bundled;
  }
  return command;
}
/**
 * Normalizes a client-supplied extension to a known media extension.
 * Strips non-alphanumerics and lowercases; anything outside the allow-list
 * (mp4/mov/webm for video, m4a/mp3/wav for audio) falls back to 'webm'.
 */
function safeExt(ext: string) {
  const cleaned = ext.toLowerCase().replace(/[^a-z0-9]/g, '');
  switch (cleaned) {
    case 'mp4':
    case 'mov':
    case 'webm':
    case 'm4a':
    case 'mp3':
    case 'wav':
      return cleaned;
    default:
      return 'webm';
  }
}
/**
 * Validates a client-reported clip duration.
 *
 * @returns the duration when it is a finite number in (0, 600]; otherwise 0.
 */
function sanitizeClientDuration(durationSeconds: number | undefined) {
  // Check undefined first so TypeScript narrows the parameter; Number.isFinite
  // then rejects NaN and ±Infinity. (Previously isFinite ran first, which made
  // the undefined comparison unreachable dead logic.)
  if (durationSeconds === undefined || !Number.isFinite(durationSeconds)) return 0;
  // Anything non-positive or longer than 10 minutes is treated as bogus.
  if (durationSeconds <= 0 || durationSeconds > 600) return 0;
  return durationSeconds;
}
/**
 * Picks the effective duration for a clip: the sanitized client-reported
 * recording time when valid, otherwise the ffprobe measurement (or 0 when
 * neither is usable).
 */
function resolveRecordedDuration(probedSeconds: number, clientSeconds: number | undefined) {
  // The browser timer reflects when the user actually tapped stop; some mobile
  // recordings report the prompt cap as media duration, so the client value
  // wins whenever it passes the sanity checks.
  const client = sanitizeClientDuration(clientSeconds);
  if (client > 0) return client;
  return Number.isFinite(probedSeconds) && probedSeconds > 0 ? probedSeconds : 0;
}
/**
 * Spawns ffmpeg or ffprobe with the given args and resolves when the process
 * exits with code 0. On a non-zero exit, rejects with the last few captured
 * log lines; on a spawn failure, rejects with the underlying error message.
 * `label` prefixes every error for easier diagnosis.
 */
async function runCommand(command: string, args: string[], label: string) {
  return await new Promise<void>((resolve, reject) => {
    const binaryPath = resolveMediaCommand(command === 'ffprobe' ? 'ffprobe' : 'ffmpeg');
    const child = spawn(binaryPath, args, {
      windowsHide: true,
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Retain only the trailing 80 non-empty log lines for error reporting.
    const tail: string[] = [];
    const capture = (chunk: Buffer) => {
      for (const rawLine of chunk.toString('utf8').split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line) tail.push(line);
      }
      if (tail.length > 80) tail.splice(0, tail.length - 80);
    };
    child.stdout.on('data', capture);
    child.stderr.on('data', capture);
    child.on('error', (err) => {
      reject(new Error(`${label} failed to start: ${err.message}`));
    });
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
        return;
      }
      reject(
        new Error(
          tail.length
            ? `${label} failed with exit code ${code}: ${tail.slice(-10).join(' | ')}`
            : `${label} failed with exit code ${code}`,
        ),
      );
    });
  });
}
/**
 * Persists an uploaded File to `filePath`.
 * @throws when the upload contains zero bytes.
 */
async function writeClip(file: File, filePath: string) {
  const payload = Buffer.from(await file.arrayBuffer());
  if (payload.length === 0) throw new Error('One of the clips was empty.');
  await writeFile(filePath, payload);
}
/**
 * Returns an on-disk path for a clip: the existing `filePath` when provided,
 * otherwise the uploaded file written to `fallbackPath`.
 * @throws when the clip has neither a path nor an uploaded file.
 */
async function prepareClipSource(clip: ClipFile, fallbackPath: string) {
  const { filePath, file } = clip;
  if (filePath) return filePath;
  if (!file) throw new Error('Clip source is missing.');
  await writeClip(file, fallbackPath);
  return fallbackPath;
}
/**
 * Reads the container-level duration (seconds) of a media file via
 * `ffprobe -show_entries format=duration`. Returns 0 — never throws — when
 * ffprobe fails, exits non-zero, or reports a non-finite value.
 */
async function probeFormatDuration(filePath: string) {
  try {
    const args = [
      '-v',
      'error',
      '-show_entries',
      'format=duration',
      '-of',
      'default=noprint_wrappers=1:nokey=1',
      filePath,
    ];
    // Collect ffprobe's stdout; a spawn error or non-zero exit rejects this
    // promise and lands in the catch below, which maps all failures to 0.
    const output = await new Promise<string>((resolve, reject) => {
      const child = spawn(resolveMediaCommand('ffprobe'), args, {
        windowsHide: true,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      let stdout = '';
      child.stdout.on('data', (chunk: Buffer) => {
        stdout += chunk.toString('utf8');
      });
      child.on('error', reject);
      child.on('close', (code) => {
        if (code === 0) resolve(stdout.trim());
        else reject(new Error(`ffprobe exited with ${code}`));
      });
    });
    const duration = Number(output);
    // Clamp negatives to 0 and treat NaN as "unknown".
    return Number.isFinite(duration) ? Math.max(0, duration) : 0;
  } catch {
    return 0;
  }
}
/**
 * Reads the duration (seconds) of the first video or audio stream via
 * `ffprobe -select_streams`. Falls back to the container-level duration
 * (probeFormatDuration) when the stream reports no usable value or probing
 * fails — some containers (e.g. certain recordings) omit per-stream duration.
 * Never throws.
 *
 * @param streamSelector 'v:0' for the first video stream, 'a:0' for audio.
 */
async function probeStreamDuration(filePath: string, streamSelector: 'v:0' | 'a:0') {
  try {
    const args = [
      '-v',
      'error',
      '-select_streams',
      streamSelector,
      '-show_entries',
      'stream=duration',
      '-of',
      'default=noprint_wrappers=1:nokey=1',
      filePath,
    ];
    const output = await new Promise<string>((resolve, reject) => {
      const child = spawn(resolveMediaCommand('ffprobe'), args, {
        windowsHide: true,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      let stdout = '';
      child.stdout.on('data', (chunk: Buffer) => {
        stdout += chunk.toString('utf8');
      });
      child.on('error', reject);
      child.on('close', (code) => {
        if (code === 0) resolve(stdout.trim());
        else reject(new Error(`ffprobe exited with ${code}`));
      });
    });
    // Take the first non-empty output line; "N/A" or empty parses to NaN and
    // triggers the container-duration fallback below.
    const duration = Number(output.split(/\r?\n/).find(Boolean));
    if (Number.isFinite(duration) && duration > 0) return duration;
    return await probeFormatDuration(filePath);
  } catch {
    return await probeFormatDuration(filePath);
  }
}
/**
 * Checks whether a media file contains at least one audio stream by asking
 * ffprobe to list audio stream indices. Returns false — never throws — when
 * probing fails or no audio stream is listed.
 */
async function probeHasAudio(filePath: string) {
  try {
    const args = [
      '-v',
      'error',
      '-select_streams',
      'a',
      '-show_entries',
      'stream=index',
      '-of',
      'csv=p=0',
      filePath,
    ];
    const output = await new Promise<string>((resolve, reject) => {
      const child = spawn(resolveMediaCommand('ffprobe'), args, {
        windowsHide: true,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      let stdout = '';
      child.stdout.on('data', (chunk: Buffer) => {
        stdout += chunk.toString('utf8');
      });
      child.on('error', reject);
      child.on('close', (code) => {
        if (code === 0) resolve(stdout.trim());
        else reject(new Error(`ffprobe exited with ${code}`));
      });
    });
    // Any non-empty (trimmed) output means at least one audio stream index.
    return output.length > 0;
  } catch {
    return false;
  }
}
/**
 * Ensures the rendered file's video track is exactly as long as its audio
 * track ("audio is the master clock").
 *
 * - If either stream has no measurable duration, the input is returned as-is
 *   with the larger of video/container duration.
 * - If the video already covers the audio (within 80 ms) and the container is
 *   not more than 200 ms longer than the audio, the input is returned as-is.
 * - Otherwise the file is re-encoded to `outputPath`: the last video frame is
 *   cloned (tpad stop_mode=clone) to pad video up to the audio length, the
 *   video is trimmed to the audio duration, and the audio stream is copied
 *   untouched.
 *
 * @returns the path to use (input or re-encoded output) and its duration.
 */
async function enforceAudioMasterOutput(inputPath: string, outputPath: string) {
  const [videoDuration, audioDuration, formatDurationValue] = await Promise.all([
    probeStreamDuration(inputPath, 'v:0'),
    probeStreamDuration(inputPath, 'a:0'),
    probeFormatDuration(inputPath),
  ]);
  if (audioDuration <= 0 || videoDuration <= 0) {
    return {
      path: inputPath,
      durationSeconds: Math.max(videoDuration, formatDurationValue),
    };
  }
  // 0.08 s / 0.2 s tolerances absorb codec frame-boundary rounding.
  if (videoDuration + 0.08 >= audioDuration && formatDurationValue <= audioDuration + 0.2) {
    return {
      path: inputPath,
      durationSeconds: audioDuration,
    };
  }
  const videoPadSeconds = Math.max(0, audioDuration - videoDuration);
  // Only bother cloning the last frame when the gap exceeds 80 ms; a simple
  // trim suffices otherwise.
  const videoFilter =
    videoPadSeconds > 0.08
      ? `[0:v]tpad=stop_mode=clone:stop_duration=${formatDuration(
          videoPadSeconds,
        )},trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`
      : `[0:v]trim=duration=${formatDuration(audioDuration)},setpts=PTS-STARTPTS[v]`;
  await runCommand(
    'ffmpeg',
    [
      '-y',
      '-i',
      inputPath,
      '-filter_complex',
      videoFilter,
      '-map',
      '[v]',
      '-map',
      '0:a:0',
      '-c:v',
      'libx264',
      '-preset',
      'ultrafast',
      '-crf',
      '30',
      '-r',
      String(VIDEO_FPS),
      '-c:a',
      'copy',
      '-t',
      formatDuration(audioDuration),
      '-movflags',
      '+faststart',
      outputPath,
    ],
    'Final audio-master duration fix',
  );
  return {
    path: outputPath,
    durationSeconds: audioDuration,
  };
}
/**
 * Formats a duration for ffmpeg arguments: up to three decimal places, with
 * trailing zeros (and a bare decimal point) stripped — e.g. 10 -> "10",
 * 2.250 -> "2.25".
 */
function formatDuration(seconds: number) {
  const fixed = seconds.toFixed(3);
  return fixed.replace(/\.?0+$/, '');
}
/**
 * Resolves the recording step for a clip: its explicit positive-integer
 * `step` when present, otherwise `fallbackStartStep + index` (sequential
 * numbering from the fallback start).
 */
function inferClipStep(clip: ClipFile, index: number, fallbackStartStep: number) {
  const step = clip.step;
  if (step !== undefined && Number.isInteger(step) && step > 0) return step;
  return fallbackStartStep + index;
}
/** Returns the first prepared clip assigned to `step`, or null when absent. */
function clipByStep(clips: PreparedClip[], step: number) {
  for (const clip of clips) {
    if (clip.step === step) return clip;
  }
  return null;
}
/**
 * Maximum allowed video seconds for a given step: the per-step override when
 * configured, otherwise the global MAX_VIDEO_CLIP_SECONDS cap.
 */
function maxSecondsForStep(step: number) {
  const configured = STEP_VIDEO_MAX_SECONDS[step];
  return configured ?? MAX_VIDEO_CLIP_SECONDS;
}
/**
 * Seconds of a clip that may actually be scheduled: its duration clamped to
 * the per-step cap, or 0 when the duration is unknown/non-positive.
 */
function usableClipDuration(source: PreparedClip) {
  if (source.duration <= 0) return 0;
  return Math.min(source.duration, maxSecondsForStep(source.step));
}
/** Sum of usable (capped) seconds across all prepared video clips. */
function totalUsableVideoDuration(videoClips: PreparedClip[]) {
  let total = 0;
  for (const clip of videoClips) {
    total += usableClipDuration(clip);
  }
  return total;
}
/**
 * Appends a video segment, silently skipping null sources and slivers shorter
 * than 0.25 s (too short to be worth a cut).
 */
function addSegment(segments: VideoSegment[], source: PreparedClip | null, duration: number) {
  if (source === null || duration < 0.25) return;
  segments.push({ source, duration });
}
/**
 * Builds a per-path budget map of usable seconds. When the same path appears
 * under multiple steps, the largest usable duration wins.
 */
function createClipBudgets(videoClips: PreparedClip[]) {
  const budgets = new Map<string, number>();
  for (const clip of videoClips) {
    const existing = budgets.get(clip.path) ?? 0;
    budgets.set(clip.path, Math.max(existing, usableClipDuration(clip)));
  }
  return budgets;
}
/**
 * Appends a segment while charging the clip's remaining budget.
 * The granted duration is capped by the budget; grants under 0.25 s are
 * dropped entirely. Returns the number of seconds actually added.
 */
function addSegmentWithBudget(
  segments: VideoSegment[],
  budgets: Map<string, number>,
  source: PreparedClip | null,
  duration: number,
) {
  if (!source || duration < 0.25) return 0;
  const available = budgets.get(source.path) ?? usableClipDuration(source);
  const granted = Math.min(duration, available);
  if (granted < 0.25) return 0;
  budgets.set(source.path, Math.max(0, available - granted));
  addSegment(segments, source, granted);
  return granted;
}
/**
 * Deduplicates candidate clips by path, dropping nulls and clips with less
 * than 0.25 s of usable footage. Preserves first-seen order.
 */
function uniqueSources(sources: Array<PreparedClip | null>) {
  const seenPaths = new Set<string>();
  const unique: PreparedClip[] = [];
  for (const source of sources) {
    if (!source) continue;
    if (seenPaths.has(source.path)) continue;
    if (usableClipDuration(source) < 0.25) continue;
    seenPaths.add(source.path);
    unique.push(source);
  }
  return unique;
}
/**
 * Fills up to `duration` seconds by cycling through the distinct usable
 * sources in short chunks (at most 1.35 s when several candidates exist,
 * 1 s when there is only one), skipping ahead when the previous segment came
 * from the same clip so back-to-back repeats are avoided where possible.
 * Returns the total seconds actually added; the 200-iteration guard bounds
 * the loop.
 */
function addRotatingSegments(
  segments: VideoSegment[],
  sources: Array<PreparedClip | null>,
  duration: number,
) {
  const candidates = uniqueSources(sources);
  if (duration < 0.25 || candidates.length === 0) return 0;
  let added = 0;
  let remaining = duration;
  let cursor = 0;
  let guard = 0;
  while (remaining >= 0.25 && guard < 200) {
    guard += 1;
    let source = candidates[cursor % candidates.length]!;
    // Avoid repeating the clip that produced the previous segment when an
    // alternative exists.
    if (
      candidates.length > 1 &&
      segments[segments.length - 1]?.source.path === source.path
    ) {
      cursor += 1;
      source = candidates[cursor % candidates.length]!;
    }
    const sourceDuration = usableClipDuration(source);
    const chunkDuration = Math.min(remaining, sourceDuration, candidates.length > 1 ? 1.35 : 1);
    // A chunk under 0.25 s means no candidate can contribute more; stop.
    if (chunkDuration < 0.25) break;
    addSegment(segments, source, chunkDuration);
    added += chunkDuration;
    remaining -= chunkDuration;
    cursor += 1;
  }
  return added;
}
/**
 * Schedules clips back-to-back in the given order until `targetSeconds`
 * (defaults to the total usable footage) is consumed. Each clip contributes
 * at most its usable (capped) duration; leftovers under 0.25 s stop the plan.
 */
function buildLinearSegments(videoClips: PreparedClip[], targetSeconds = totalUsableVideoDuration(videoClips)) {
  const segments: VideoSegment[] = [];
  let budget = Math.max(0, targetSeconds);
  for (const source of videoClips) {
    if (budget <= 0.25) break;
    const take = Math.min(usableClipDuration(source), budget);
    addSegment(segments, source, take);
    budget -= take;
  }
  return segments;
}
/** Returns a copy of the clips ordered by ascending step (input untouched). */
function sortByStep(clips: PreparedClip[]) {
  const ordered = clips.slice();
  ordered.sort((left, right) => left.step - right.step);
  return ordered;
}
/**
 * Distributes `targetSeconds` of screen time across the clips in step order,
 * aiming for an even share per clip. Runs in rounds (bounded at 20): each
 * round gives every still-active clip an equal slice of the remaining target,
 * limited by that clip's per-path budget (usableClipDuration). Clips whose
 * budget drops below 0.25 s leave the rotation. Whatever the rounds could not
 * place is back-filled by addRotatingSegments, which is NOT budget-limited.
 */
function buildBalancedVideoSegments(videoClips: PreparedClip[], targetSeconds: number) {
  const orderedClips = sortByStep(videoClips);
  const segments: VideoSegment[] = [];
  const budgets = createClipBudgets(orderedClips);
  let remaining = targetSeconds;
  let activeSources = orderedClips.filter((clip) => usableClipDuration(clip) >= 0.25);
  let guard = 0;
  while (remaining >= 0.25 && activeSources.length > 0 && guard < 20) {
    guard += 1;
    // Equal share of the remaining target for every clip still in rotation.
    const share = remaining / activeSources.length;
    let addedThisRound = 0;
    for (const source of activeSources) {
      if (remaining < 0.25) break;
      const remainingBudget = budgets.get(source.path) ?? usableClipDuration(source);
      if (remainingBudget < 0.25) continue;
      const requestedSeconds = Math.min(share, remainingBudget, remaining);
      const added = addSegmentWithBudget(segments, budgets, source, requestedSeconds);
      addedThisRound += added;
      remaining -= added;
    }
    // A round that placed (almost) nothing means all budgets are exhausted.
    if (addedThisRound < 0.25) break;
    activeSources = activeSources.filter((source) => {
      const remainingBudget = budgets.get(source.path) ?? 0;
      return remainingBudget >= 0.25;
    });
  }
  if (remaining >= 0.25) {
    addRotatingSegments(segments, orderedClips, remaining);
  }
  return segments;
}
/**
 * Appends an audio segment, skipping null sources and slivers under 0.25 s.
 * The stored duration is additionally capped by the source's own duration
 * when that duration is known (positive).
 */
function addAudioSegment(
  segments: AudioSegment[],
  source: PreparedClip | null,
  duration: number,
) {
  if (!source || duration < 0.25) return;
  const cap = source.duration > 0 ? source.duration : duration;
  segments.push({ source, duration: Math.min(duration, cap) });
}
/**
 * Builds the segment schedule for renders that have separate voiceover clips.
 * Audio clips are laid out in step order until the render target is filled;
 * video screen time is balanced across clips for the same target, with a
 * linear plan as fallback when balancing yields nothing.
 */
function buildVoiceAwarePlan({
  videoClips,
  audioClips,
  renderTargetSeconds,
}: {
  videoClips: PreparedClip[];
  audioClips: PreparedClip[];
  renderTargetSeconds: number;
}): VoiceAwarePlan {
  if (videoClips.length === 0) return { videoSegments: [], audioSegments: [] };
  const targetSeconds = Math.max(1, renderTargetSeconds);
  const audioSegments: AudioSegment[] = [];
  let consumedSeconds = 0;
  for (const voiceClip of sortByStep(audioClips)) {
    if (voiceClip.duration <= 0) continue;
    const room = Math.max(0, targetSeconds - consumedSeconds);
    if (room < 0.25) break;
    const take = Math.min(voiceClip.duration, room);
    addAudioSegment(audioSegments, voiceClip, take);
    consumedSeconds += take;
  }
  const balanced = buildBalancedVideoSegments(videoClips, targetSeconds);
  if (balanced.length > 0) {
    return { videoSegments: balanced, audioSegments };
  }
  return { videoSegments: buildLinearSegments(videoClips, targetSeconds), audioSegments };
}
/**
 * Renders the uploaded clips into a single portrait MP4 on the server.
 *
 * Pipeline: materialize clips into a per-run scratch directory, probe their
 * durations and audio presence with ffprobe, plan video/audio segments
 * (voiceover-aware when separate audio clips exist), run one ffmpeg pass
 * building the composite via a filter graph, then run enforceAudioMasterOutput
 * to guarantee the video track matches the audio length.
 *
 * @throws when no video clips were supplied, no usable segments could be
 *   planned, a clip is empty, or an ffmpeg/ffprobe invocation fails.
 * @returns the MP4 bytes, the duration rounded to whole seconds, and a
 *   suggested filename. The scratch directory is always removed.
 */
export async function renderClipsOnServer(input: ServerRenderInput): Promise<ServerRenderResult> {
  if (input.videoClips.length === 0) {
    throw new Error('No video clips were uploaded.');
  }
  // Per-run scratch directory; cleaned up in the finally block below.
  const runId = Math.random().toString(36).slice(2, 10);
  const runDir = path.join(WORK_DIR, runId);
  await mkdir(runDir, { recursive: true });
  const videoPaths: Array<{ path: string; step: number; durationSeconds?: number }> = [];
  const audioPaths: Array<{ path: string; step: number; durationSeconds?: number }> = [];
  const outputPath = path.join(runDir, `matcha-server-${runId}.mp4`);
  const fixedOutputPath = path.join(runDir, `matcha-server-${runId}-fixed.mp4`);
  try {
    // Materialize every clip on disk. Video steps default to 1..n and audio
    // steps to 5..n when the clip carries no explicit step (see inferClipStep).
    for (let i = 0; i < input.videoClips.length; i++) {
      const clip = input.videoClips[i]!;
      const filePath = path.join(runDir, `video-${i}.${safeExt(clip.ext)}`);
      videoPaths.push({
        path: await prepareClipSource(clip, filePath),
        step: inferClipStep(clip, i, 1),
        durationSeconds: clip.durationSeconds,
      });
    }
    for (let i = 0; i < input.audioClips.length; i++) {
      const clip = input.audioClips[i]!;
      const filePath = path.join(runDir, `audio-${i}.${safeExt(clip.ext)}`);
      audioPaths.push({
        path: await prepareClipSource(clip, filePath),
        step: inferClipStep(clip, i, 5),
        durationSeconds: clip.durationSeconds,
      });
    }
    // Probe everything in parallel: stream durations plus whether each video
    // clip carries its own audio track.
    const [videoDurations, audioDurations, videoAudioFlags] = await Promise.all([
      Promise.all(videoPaths.map((clip) => probeStreamDuration(clip.path, 'v:0'))),
      Promise.all(audioPaths.map((clip) => probeStreamDuration(clip.path, 'a:0'))),
      Promise.all(videoPaths.map((clip) => probeHasAudio(clip.path))),
    ]);
    // resolveRecordedDuration prefers the client-reported recording time over
    // the probed media duration (see that function's rationale).
    const preparedVideoClips = videoPaths.map((clip, index) => ({
      path: clip.path,
      step: clip.step,
      duration: resolveRecordedDuration(videoDurations[index] ?? 0, clip.durationSeconds),
      hasAudio: videoAudioFlags[index] ?? false,
    }));
    const preparedAudioClips = audioPaths.map((clip, index) => ({
      path: clip.path,
      step: clip.step,
      duration: resolveRecordedDuration(audioDurations[index] ?? 0, clip.durationSeconds),
      hasAudio: true,
    }));
    const hasVoiceover = audioPaths.length > 0;
    const audioDurationSeconds = preparedAudioClips.reduce(
      (total, clip) => total + clip.duration,
      0,
    );
    // Target length: total voiceover length when measurable, otherwise the
    // usable video footage capped at MAX_AUDIO_CLIP_SECONDS; never below 1 s.
    const fallbackTargetSeconds = Math.min(
      MAX_AUDIO_CLIP_SECONDS,
      Math.max(1, totalUsableVideoDuration(preparedVideoClips)),
    );
    const voiceoverTargetSeconds = Math.max(
      1,
      audioDurationSeconds > 0 ? audioDurationSeconds : fallbackTargetSeconds,
    );
    const voiceAwarePlan = hasVoiceover
      ? buildVoiceAwarePlan({
          videoClips: preparedVideoClips,
          audioClips: preparedAudioClips,
          renderTargetSeconds: voiceoverTargetSeconds,
        })
      : null;
    const videoSegments = voiceAwarePlan?.videoSegments ?? buildLinearSegments(preparedVideoClips);
    const audioSegments = voiceAwarePlan?.audioSegments ?? [];
    const visualDurationSeconds = videoSegments.reduce(
      (total, segment) => total + segment.duration,
      0,
    );
    const renderTargetSeconds = hasVoiceover
      ? Math.max(1, voiceoverTargetSeconds)
      : Math.max(1, visualDurationSeconds);
    if (videoSegments.length === 0) {
      throw new Error('No usable video segments were uploaded.');
    }
    // One ffmpeg input per video segment (each pre-trimmed with -t), followed
    // by every audio file once.
    const inputArgs = [
      ...videoSegments.flatMap((segment) => [
        '-t',
        formatDuration(segment.duration),
        '-i',
        segment.source.path,
      ]),
      ...audioPaths.flatMap((clip) => [
        '-i',
        clip.path,
      ]),
    ];
    // Per-segment video chain: trim, fit into the 1080x1920 canvas with
    // letterbox padding, normalize SAR/fps/pixel format.
    const videoFilters = videoSegments
      .map((segment, i) => {
        return (
          `[${i}:v]trim=duration=${formatDuration(segment.duration)},setpts=PTS-STARTPTS,` +
          `scale=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:force_original_aspect_ratio=decrease,` +
          `pad=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:(ow-iw)/2:(oh-ih)/2,` +
          `setsar=1,fps=${VIDEO_FPS},format=yuv420p[v${i}]`
        );
      })
      .join(';');
    const videoInputs = videoSegments.map((_, i) => `[v${i}]`).join('');
    // Audio inputs come after all video inputs in the ffmpeg argument list.
    const audioOffset = videoSegments.length;
    const audioInputIndexByPath = new Map(
      audioPaths.map((clip, index) => [clip.path, audioOffset + index]),
    );
    let filterComplex: string;
    if (hasVoiceover) {
      // Voiceover path: concat video-only, build the audio track from the
      // planned voice segments (48 kHz mono), pad/trim both to the target.
      const videoConcat = `${videoInputs}concat=n=${videoSegments.length}:v=1:a=0[vcat]`;
      const videoFinalize = `[vcat]trim=duration=${formatDuration(
        renderTargetSeconds,
      )},setpts=PTS-STARTPTS[v]`;
      const audioFilters = audioSegments
        .map((segment, i) => {
          const inputIndex = audioInputIndexByPath.get(segment.source.path);
          if (inputIndex === undefined) {
            throw new Error('Voice clip input was not prepared correctly.');
          }
          return (
            `[${inputIndex}:a]atrim=duration=${formatDuration(segment.duration)},` +
            `aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
            `asetpts=PTS-STARTPTS[a${i}]`
          );
        })
        .join(';');
      const audioInputs = audioSegments.map((_, i) => `[a${i}]`).join('');
      // With no plannable voice segments, synthesize silence so the output
      // still carries an audio track of the full target length.
      const audioConcat =
        audioSegments.length > 0
          ? `${audioInputs}concat=n=${audioSegments.length}:v=0:a=1[acat]`
          : `anullsrc=r=48000:cl=mono,atrim=duration=${formatDuration(
              renderTargetSeconds,
            )},aformat=sample_rates=48000:channel_layouts=mono,asetpts=PTS-STARTPTS[acat]`;
      const audioFinalize =
        `[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
        `apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
      filterComplex = [
        videoFilters,
        videoConcat,
        videoFinalize,
        audioFilters,
        audioConcat,
        audioFinalize,
      ]
        .filter(Boolean)
        .join(';');
    } else {
      // No voiceover: use each segment's embedded audio when present, silence
      // otherwise, and concat audio and video together per segment.
      const embeddedAudioFilters = videoSegments
        .map((segment, i) => {
          if (segment.source.hasAudio) {
            return (
              `[${i}:a]atrim=duration=${formatDuration(segment.duration)},` +
              `aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
              `asetpts=PTS-STARTPTS[a${i}]`
            );
          }
          return (
            `anullsrc=r=48000:cl=mono,atrim=duration=${formatDuration(segment.duration)},` +
            `aformat=sample_rates=48000:channel_layouts=mono,asetpts=PTS-STARTPTS[a${i}]`
          );
        })
        .join(';');
      const avInputs = videoSegments.map((_, i) => `[v${i}][a${i}]`).join('');
      const avConcat = `${avInputs}concat=n=${videoSegments.length}:v=1:a=1[vcat][acat]`;
      const videoFinalize = `[vcat]trim=duration=${formatDuration(
        renderTargetSeconds,
      )},setpts=PTS-STARTPTS[v]`;
      const audioFinalize =
        `[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
        `apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
      filterComplex = [
        videoFilters,
        embeddedAudioFilters,
        avConcat,
        videoFinalize,
        audioFinalize,
      ]
        .filter(Boolean)
        .join(';');
    }
    // Classic uses separate voice notes; the fallback path also supports clips
    // that already contain audio. Both paths emit one shared audio/video length.
    const outputArgs = ['-map', '[v]', '-map', '[a]'];
    await runCommand(
      'ffmpeg',
      [
        '-y',
        ...inputArgs,
        '-filter_complex',
        filterComplex,
        ...outputArgs,
        '-c:v',
        'libx264',
        '-preset',
        'ultrafast',
        '-tune',
        'zerolatency',
        '-crf',
        '30',
        '-r',
        String(VIDEO_FPS),
        '-c:a',
        'aac',
        '-b:a',
        '96k',
        '-t',
        formatDuration(renderTargetSeconds),
        '-movflags',
        '+faststart',
        '-avoid_negative_ts',
        'make_zero',
        outputPath,
      ],
      'Server clip render',
    );
    // Final pass: force the video track to match the audio length exactly.
    const finalOutput = await enforceAudioMasterOutput(outputPath, fixedOutputPath);
    const bytes = await readFile(finalOutput.path);
    if (bytes.length <= 0) throw new Error('Server render produced an empty video.');
    return {
      bytes,
      durationSeconds: Math.round(finalOutput.durationSeconds),
      filename: `matcha-server-${runId}.mp4`,
    };
  } finally {
    // Best-effort cleanup of the scratch directory; never mask the real error.
    await rm(runDir, { recursive: true, force: true }).catch(() => undefined);
  }
}