// src/lib/server/serverClipRenderer.ts
import { existsSync } from 'fs';
import { mkdir, readFile, rm, writeFile } from 'fs/promises';
import path from 'path';
import { spawn } from 'child_process';
type ClipFile = {
file?: File;
filePath?: string;
ext: string;
step?: number;
};
export type ServerRenderInput = {
videoClips: ClipFile[];
audioClips: ClipFile[];
};
export type ServerRenderResult = {
bytes: Buffer;
durationSeconds: number;
filename: string;
};
const WORK_DIR = path.join(process.cwd(), '.local-review-data', 'server-renders');
const VIDEO_WIDTH = 1080;
const VIDEO_HEIGHT = 1920;
const VIDEO_FPS = 24;
const MIN_VIDEO_CLIP_SECONDS = 5;
const MAX_VIDEO_CLIP_SECONDS = 7;
const MAX_AUDIO_CLIP_SECONDS = 12;
const FINAL_VIDEO_MAX_SECONDS = 17;
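// Per-step caps (in seconds) on how long a single video clip may run in the final cut.
// Steps without an entry fall back to MAX_VIDEO_CLIP_SECONDS.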
const STEP_VIDEO_MAX_SECONDS: Record<number, number> = {
1: 6,
2: 7,
3: 4,
};
type PreparedClip = {
path: string;
step: number;
duration: number;
hasAudio: boolean;
};
type VideoSegment = {
source: PreparedClip;
duration: number;
};
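// Resolves which ffmpeg/ffprobe binary to invoke: an explicit path from
// GRABBY_FFMPEG_PATH / GRABBY_FFPROBE_PATH wins, then (on Windows) static
// binaries unpacked under %TEMP%\grabby-media-tools, otherwise the bare
// command name so the system PATH is used.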
function resolveMediaCommand(command: 'ffmpeg' | 'ffprobe') {
const explicit =
command === 'ffmpeg' ? process.env.GRABBY_FFMPEG_PATH : process.env.GRABBY_FFPROBE_PATH;
if (explicit && existsSync(explicit)) return explicit;
const tempRoot = process.env.TEMP || process.env.TMP;
if (process.platform === 'win32' && tempRoot) {
const tempTools = path.join(tempRoot, 'grabby-media-tools', 'node_modules');
const candidate =
command === 'ffmpeg'
? path.join(tempTools, 'ffmpeg-static', 'ffmpeg.exe')
: path.join(tempTools, 'ffprobe-static', 'bin', 'win32', 'x64', 'ffprobe.exe');
if (existsSync(candidate)) return candidate;
}
return command;
}
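// Normalizes a user-supplied extension to a known video/audio suffix,
// falling back to webm for anything unrecognized.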
function safeExt(ext: string) {
const normalized = ext.toLowerCase().replace(/[^a-z0-9]/g, '');
if (normalized === 'mp4' || normalized === 'mov' || normalized === 'webm') return normalized;
if (normalized === 'm4a' || normalized === 'mp3' || normalized === 'wav') return normalized;
return 'webm';
}
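// Spawns ffmpeg or ffprobe, keeps a rolling window of the last 80 log lines,
// and surfaces the final 10 in the error message when the process exits non-zero.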
async function runCommand(command: string, args: string[], label: string) {
return await new Promise<void>((resolve, reject) => {
const child = spawn(resolveMediaCommand(command === 'ffprobe' ? 'ffprobe' : 'ffmpeg'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
const logs: string[] = [];
const collect = (chunk: Buffer) => {
const text = chunk.toString('utf8');
for (const line of text.split(/\r?\n/)) {
const trimmed = line.trim();
if (trimmed) logs.push(trimmed);
}
if (logs.length > 80) logs.splice(0, logs.length - 80);
};
child.stdout.on('data', collect);
child.stderr.on('data', collect);
child.on('error', (err) => {
reject(new Error(`${label} failed to start: ${err.message}`));
});
child.on('close', (code) => {
if (code === 0) {
resolve();
return;
}
reject(
new Error(
logs.length
? `${label} failed with exit code ${code}: ${logs.slice(-10).join(' | ')}`
: `${label} failed with exit code ${code}`,
),
);
});
});
}
async function writeClip(file: File, filePath: string) {
const bytes = Buffer.from(await file.arrayBuffer());
if (bytes.length <= 0) throw new Error('One of the clips was empty.');
await writeFile(filePath, bytes);
}
async function prepareClipSource(clip: ClipFile, fallbackPath: string) {
if (clip.filePath) return clip.filePath;
if (!clip.file) throw new Error('Clip source is missing.');
await writeClip(clip.file, fallbackPath);
return fallbackPath;
}
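// Reads the container duration (format=duration) via ffprobe; returns 0 when
// probing fails or the value is not a finite number.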
async function probeDuration(filePath: string) {
try {
const args = [
'-v',
'error',
'-show_entries',
'format=duration',
'-of',
'default=noprint_wrappers=1:nokey=1',
filePath,
];
const output = await new Promise<string>((resolve, reject) => {
const child = spawn(resolveMediaCommand('ffprobe'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
let stdout = '';
child.stdout.on('data', (chunk: Buffer) => {
stdout += chunk.toString('utf8');
});
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) resolve(stdout.trim());
else reject(new Error(`ffprobe exited with ${code}`));
});
});
const duration = Number(output);
return Number.isFinite(duration) ? Math.max(0, duration) : 0;
} catch {
return 0;
}
}
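// Returns true when ffprobe reports at least one audio stream in the file;
// treated as "no audio" if probing fails.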
async function probeHasAudio(filePath: string) {
try {
const args = [
'-v',
'error',
'-select_streams',
'a',
'-show_entries',
'stream=index',
'-of',
'csv=p=0',
filePath,
];
const output = await new Promise<string>((resolve, reject) => {
const child = spawn(resolveMediaCommand('ffprobe'), args, {
windowsHide: true,
stdio: ['ignore', 'pipe', 'pipe'],
});
let stdout = '';
child.stdout.on('data', (chunk: Buffer) => {
stdout += chunk.toString('utf8');
});
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) resolve(stdout.trim());
else reject(new Error(`ffprobe exited with ${code}`));
});
});
return output.length > 0;
} catch {
return false;
}
}
function formatDuration(seconds: number) {
return seconds.toFixed(3).replace(/\.?0+$/, '');
}
function clamp(value: number, min: number, max: number) {
return Math.max(min, Math.min(max, value));
}
function inferClipStep(clip: ClipFile, index: number, fallbackStartStep: number) {
return Number.isInteger(clip.step) && clip.step! > 0 ? clip.step! : fallbackStartStep + index;
}
function clipByStep(clips: PreparedClip[], step: number) {
return clips.find((clip) => clip.step === step) ?? null;
}
function maxSecondsForStep(step: number) {
return STEP_VIDEO_MAX_SECONDS[step] ?? MAX_VIDEO_CLIP_SECONDS;
}
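// Skips segments with no source or shorter than 0.25 s so the concat graph
// never receives effectively empty inputs.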
function addSegment(segments: VideoSegment[], source: PreparedClip | null, duration: number) {
if (!source || duration < 0.25) return;
segments.push({
source,
duration,
});
}
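// Fallback timeline: play the clips in upload order, capping each at its
// per-step limit and stopping once FINAL_VIDEO_MAX_SECONDS is spent.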
function buildLinearSegments(videoClips: PreparedClip[]) {
const segments: VideoSegment[] = [];
let remaining = FINAL_VIDEO_MAX_SECONDS;
for (const source of videoClips) {
if (remaining <= 0.25) break;
const stepMax = maxSecondsForStep(source.step);
const fallbackDuration = Math.min(MIN_VIDEO_CLIP_SECONDS, stepMax);
const sourceDuration =
source.duration > 0 ? Math.min(source.duration, stepMax) : fallbackDuration;
const duration = Math.min(sourceDuration, remaining);
addSegment(segments, source, duration);
remaining -= duration;
}
return segments;
}
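// Voiceover-aware timeline: the close and wide shots cover the "order" narration
// (step 4 audio), the action shot covers the "liked" narration (step 5 audio),
// and an optional reaction shot adds a short tail when the render target is at
// least 10 seconds. Falls back to the linear layout if no usable segments result.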
function buildVoiceAwareSegments({
videoClips,
audioClips,
renderTargetSeconds,
}: {
videoClips: PreparedClip[];
audioClips: PreparedClip[];
renderTargetSeconds: number;
}) {
if (videoClips.length === 0) return [];
const closeShot = clipByStep(videoClips, 1) ?? videoClips[0]!;
const wideShot = clipByStep(videoClips, 2) ?? videoClips[1] ?? closeShot;
const actionShot = clipByStep(videoClips, 3) ?? videoClips[2] ?? wideShot;
const reactionShot =
videoClips.find((clip) => clip.step >= 6) ??
(videoClips.length > 3 ? videoClips[videoClips.length - 1]! : null);
const orderAudio = clipByStep(audioClips, 4) ?? audioClips[0] ?? null;
const likedAudio =
clipByStep(audioClips, 5) ??
audioClips.find((clip) => clip !== orderAudio) ??
null;
const targetSeconds = Math.max(1, renderTargetSeconds);
const reactionSeconds =
reactionShot && targetSeconds >= 10
? Math.min(3, reactionShot.duration > 0 ? reactionShot.duration : 3)
: 0;
const narrativeSeconds = Math.max(1, targetSeconds - reactionSeconds);
const fallbackOrderSeconds = likedAudio ? 7 : narrativeSeconds * 0.42;
const rawOrderSeconds = orderAudio?.duration && orderAudio.duration > 0
? orderAudio.duration
: fallbackOrderSeconds;
const minOrderSeconds = Math.min(2, narrativeSeconds);
const maxOrderSeconds = likedAudio ? Math.max(minOrderSeconds, narrativeSeconds - 2) : narrativeSeconds;
const orderSeconds = clamp(rawOrderSeconds, minOrderSeconds, maxOrderSeconds);
const likedSeconds = Math.max(0, narrativeSeconds - orderSeconds);
const segments: VideoSegment[] = [];
if (closeShot === wideShot || orderSeconds < 2.5) {
addSegment(segments, closeShot, orderSeconds);
} else {
const closeSeconds = clamp(orderSeconds * 0.58, 1.25, orderSeconds - 0.75);
addSegment(segments, closeShot, closeSeconds);
addSegment(segments, wideShot, orderSeconds - closeSeconds);
}
addSegment(segments, actionShot, likedSeconds);
addSegment(segments, reactionShot, reactionSeconds);
return segments.length > 0 ? segments : buildLinearSegments(videoClips);
}
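/**
 * Renders the uploaded clips into a single 1080x1920 MP4 with one shared
 * audio/video length, returning the encoded bytes, rounded duration, and filename.
 *
 * A minimal usage sketch, assuming a hypothetical route handler that has already
 * parsed `File` objects out of a multipart form (the handler and its variables
 * are illustrative, not part of this module):
 *
 * // const result = await renderClipsOnServer({
 * //   videoClips: videoFiles.map((file, i) => ({ file, ext: 'webm', step: i + 1 })),
 * //   audioClips: audioFiles.map((file, i) => ({ file, ext: 'm4a', step: i + 4 })),
 * // });
 * // return new Response(result.bytes, { headers: { 'Content-Type': 'video/mp4' } });
 */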
export async function renderClipsOnServer(input: ServerRenderInput): Promise<ServerRenderResult> {
if (input.videoClips.length === 0) {
throw new Error('No video clips were uploaded.');
}
const runId = Math.random().toString(36).slice(2, 10);
const runDir = path.join(WORK_DIR, runId);
await mkdir(runDir, { recursive: true });
const videoPaths: Array<{ path: string; step: number }> = [];
const audioPaths: Array<{ path: string; step: number }> = [];
const outputPath = path.join(runDir, `matcha-server-${runId}.mp4`);
try {
for (let i = 0; i < input.videoClips.length; i++) {
const clip = input.videoClips[i]!;
const filePath = path.join(runDir, `video-${i}.${safeExt(clip.ext)}`);
videoPaths.push({
path: await prepareClipSource(clip, filePath),
step: inferClipStep(clip, i, 1),
});
}
for (let i = 0; i < input.audioClips.length; i++) {
const clip = input.audioClips[i]!;
const filePath = path.join(runDir, `audio-${i}.${safeExt(clip.ext)}`);
audioPaths.push({
path: await prepareClipSource(clip, filePath),
step: inferClipStep(clip, i, 4),
});
}
const [videoDurations, audioDurations, videoAudioFlags] = await Promise.all([
Promise.all(videoPaths.map((clip) => probeDuration(clip.path))),
Promise.all(audioPaths.map((clip) => probeDuration(clip.path))),
Promise.all(videoPaths.map((clip) => probeHasAudio(clip.path))),
]);
const preparedVideoClips = videoPaths.map((clip, index) => ({
path: clip.path,
step: clip.step,
duration: videoDurations[index] ?? 0,
hasAudio: videoAudioFlags[index] ?? false,
}));
const preparedAudioClips = audioPaths.map((clip, index) => ({
path: clip.path,
step: clip.step,
duration: audioDurations[index] ?? 0,
hasAudio: true,
}));
const hasVoiceover = audioPaths.length > 0;
const audioDurationSeconds = audioDurations.reduce((total, duration) => total + duration, 0);
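    // When narration is present, target the total voiceover duration (capped at
    // FINAL_VIDEO_MAX_SECONDS). If durations could not be probed, assume the
    // full cap for multi-clip narration and MAX_AUDIO_CLIP_SECONDS for a single clip.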
const voiceoverTargetSeconds =
Math.max(
1,
Math.min(
FINAL_VIDEO_MAX_SECONDS,
audioDurationSeconds > 0
? audioDurationSeconds
: preparedAudioClips.length >= 2
? FINAL_VIDEO_MAX_SECONDS
: MAX_AUDIO_CLIP_SECONDS,
),
);
const videoSegments = hasVoiceover
? buildVoiceAwareSegments({
videoClips: preparedVideoClips,
audioClips: preparedAudioClips,
renderTargetSeconds: voiceoverTargetSeconds,
})
: buildLinearSegments(preparedVideoClips);
const renderTargetSeconds = hasVoiceover
? voiceoverTargetSeconds
: Math.max(
1,
Math.min(
FINAL_VIDEO_MAX_SECONDS,
videoSegments.reduce((total, segment) => total + segment.duration, 0),
),
);
if (videoSegments.length === 0) {
      throw new Error('No usable video segments could be built from the uploaded clips.');
}
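    // Each video segment is fed in with -stream_loop -1 and a -t cap so short
    // clips repeat until their slot is filled; audio inputs are capped at
    // MAX_AUDIO_CLIP_SECONDS before the filter graph trims them further.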
const inputArgs = [
...videoSegments.flatMap((segment) => [
'-stream_loop',
'-1',
'-t',
formatDuration(segment.duration),
'-i',
segment.source.path,
]),
...audioPaths.flatMap((clip) => [
'-t',
String(MAX_AUDIO_CLIP_SECONDS),
'-i',
clip.path,
]),
];
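    // Scale and pad every segment into the 1080x1920 portrait frame, normalize
    // the frame rate and pixel format, and label each result [v0], [v1], ...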
const videoFilters = videoSegments
.map((segment, i) => {
return (
`[${i}:v]trim=duration=${formatDuration(segment.duration)},setpts=PTS-STARTPTS,` +
`scale=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:force_original_aspect_ratio=decrease,` +
`pad=${VIDEO_WIDTH}:${VIDEO_HEIGHT}:(ow-iw)/2:(oh-ih)/2,` +
`setsar=1,fps=${VIDEO_FPS},format=yuv420p[v${i}]`
);
})
.join(';');
const videoInputs = videoSegments.map((_, i) => `[v${i}]`).join('');
const audioOffset = videoSegments.length;
let filterComplex: string;
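    // With a voiceover, the video segments are concatenated without their embedded
    // audio and the uploaded voice clips become the soundtrack. Without one, each
    // segment contributes its own embedded audio (or generated silence) so audio
    // and video stay aligned through the concat.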
if (hasVoiceover) {
const videoConcat = `${videoInputs}concat=n=${videoSegments.length}:v=1:a=0[vcat]`;
const videoFinalize = `[vcat]trim=duration=${formatDuration(
renderTargetSeconds,
)},setpts=PTS-STARTPTS[v]`;
const audioFilters = audioPaths
.map((_, i) => {
return (
`[${audioOffset + i}:a]atrim=duration=${MAX_AUDIO_CLIP_SECONDS},` +
`aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
`asetpts=PTS-STARTPTS[a${i}]`
);
})
.join(';');
const audioInputs = audioPaths.map((_, i) => `[a${i}]`).join('');
const audioConcat = `${audioInputs}concat=n=${audioPaths.length}:v=0:a=1[acat]`;
const audioFinalize =
`[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
`apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
filterComplex = [
videoFilters,
videoConcat,
videoFinalize,
audioFilters,
audioConcat,
audioFinalize,
]
.filter(Boolean)
.join(';');
} else {
const embeddedAudioFilters = videoSegments
.map((segment, i) => {
if (segment.source.hasAudio) {
return (
`[${i}:a]atrim=duration=${formatDuration(segment.duration)},` +
`aresample=48000,aformat=sample_rates=48000:channel_layouts=mono,` +
`asetpts=PTS-STARTPTS[a${i}]`
);
}
return (
`anullsrc=r=48000:cl=mono,atrim=duration=${formatDuration(segment.duration)},` +
`aformat=sample_rates=48000:channel_layouts=mono,asetpts=PTS-STARTPTS[a${i}]`
);
})
.join(';');
const avInputs = videoSegments.map((_, i) => `[v${i}][a${i}]`).join('');
const avConcat = `${avInputs}concat=n=${videoSegments.length}:v=1:a=1[vcat][acat]`;
const videoFinalize = `[vcat]trim=duration=${formatDuration(
renderTargetSeconds,
)},setpts=PTS-STARTPTS[v]`;
const audioFinalize =
`[acat]atrim=duration=${formatDuration(renderTargetSeconds)},` +
`apad=whole_dur=${formatDuration(renderTargetSeconds)},asetpts=PTS-STARTPTS[a]`;
filterComplex = [
videoFilters,
embeddedAudioFilters,
avConcat,
videoFinalize,
audioFinalize,
]
.filter(Boolean)
.join(';');
}
// V3 records picture and speech together. Older voiceover renders still use
// separate audio clips, but both paths now emit one shared audio/video length.
const outputArgs = ['-map', '[v]', '-map', '[a]'];
await runCommand(
'ffmpeg',
[
'-y',
...inputArgs,
'-filter_complex',
filterComplex,
...outputArgs,
'-c:v',
'libx264',
'-preset',
'ultrafast',
'-tune',
'zerolatency',
'-crf',
'30',
'-r',
String(VIDEO_FPS),
'-c:a',
'aac',
'-b:a',
'96k',
'-t',
formatDuration(renderTargetSeconds),
'-movflags',
'+faststart',
'-avoid_negative_ts',
'make_zero',
outputPath,
],
'Server clip render',
);
const bytes = await readFile(outputPath);
if (bytes.length <= 0) throw new Error('Server render produced an empty video.');
return {
bytes,
durationSeconds: Math.round(await probeDuration(outputPath)),
filename: `matcha-server-${runId}.mp4`,
};
} finally {
await rm(runDir, { recursive: true, force: true }).catch(() => undefined);
}
}