| import types |
| import random |
| import spaces |
| import logging |
| import os |
| import sys |
| from pathlib import Path |
| from datetime import datetime |
|
|
| |
# Startup banner — printed immediately so the Space build log shows progress.
print("🚀 Iniciando VEO3 Free Space...")

# Basic logging for the whole app; module logger for local messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# SPACE_ID is set by Hugging Face when running inside a Space deployment.
space_id = os.getenv("SPACE_ID")
if space_id:
    print(f"✅ Ejecutando en Space: {space_id}")

# NOTE(review): torch is deliberately imported AFTER `spaces` (imported at the
# top of the file) — HF ZeroGPU expects `spaces` first; keep this order.
import torch
if torch.cuda.is_available():
    print(f"✅ GPU disponible: {torch.cuda.get_device_name()}")
    print(f"✅ Memoria GPU: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️ GPU no disponible - usando CPU")
|
|
| import numpy as np |
| import torchaudio |
| from diffusers import AutoencoderKLWan, UniPCMultistepScheduler |
| from diffusers.utils import export_to_video |
| from diffusers import AutoModel |
| import gradio as gr |
| import tempfile |
| from huggingface_hub import hf_hub_download |
|
|
| from src.pipeline_wan_nag import NAGWanPipeline |
| from src.transformer_wan_nag import NagWanTransformer3DModel |
|
|
| |
# mmaudio may not be installed on first boot; fall back to an editable install.
try:
    import mmaudio
    print("✅ mmaudio importado correctamente")
except ImportError:
    print("📦 Instalando mmaudio...")
    # NOTE(review): installs the current working directory in editable mode —
    # assumes the Space's CWD is the mmaudio project root; confirm.
    os.system("pip install -e .")
    import mmaudio
    print("✅ mmaudio instalado e importado")

# Audio-generation helpers from mmaudio (model configs, inference utilities).
from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate as mmaudio_generate,
                                load_video, make_video, setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

print("✅ Todas las importaciones completadas")
|
|
| |
# ---------------- Video generation constants ----------------
MOD_VALUE = 32                  # height/width are snapped down to multiples of this
DEFAULT_DURATION_SECONDS = 4
DEFAULT_STEPS = 4               # default inference steps (UI allows 1-8)
DEFAULT_SEED = 2025
DEFAULT_H_SLIDER_VALUE = 480
DEFAULT_W_SLIDER_VALUE = 832
# NOTE(review): appears unused in this file — presumably a max pixel-area budget; confirm.
NEW_FORMULA_MAX_AREA = 480.0 * 832.0

# UI slider bounds for resolution; max seed fits a signed 32-bit int.
SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
MAX_SEED = np.iinfo(np.int32).max

FIXED_FPS = 16                  # output frame rate; frames = seconds * FIXED_FPS + 1
MIN_FRAMES_MODEL = 8            # frame-count range clamped in generate_video_with_audio
MAX_FRAMES_MODEL = 129

# Default negative prompts (Spanish UI text — part of runtime behavior, do not translate).
DEFAULT_NAG_NEGATIVE_PROMPT = "Estático, inmóvil, quieto, feo, mala calidad, peor calidad, mal dibujado, baja resolución, borroso, falta de detalles"
DEFAULT_AUDIO_NEGATIVE_PROMPT = "música"

# ---------------- Model identifiers on the Hugging Face Hub ----------------
MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
SUB_MODEL_ID = "vrgamedevgirl84/Wan14BT2VFusioniX"
SUB_MODEL_FILENAME = "Wan14BT2VFusioniX_fp16_.safetensors"
# NOTE(review): the LoRA constants below are never referenced in this file — confirm intent.
LORA_REPO_ID = "Kijai/WanVideo_comfy"
LORA_FILENAME = "Wan21_CausVid_14B_T2V_lora_rank32.safetensors"
|
|
| |
# ---------------- Global runtime setup ----------------
torch.backends.cuda.matmul.allow_tf32 = True   # enable TF32 matmul on supported GPUs
torch.backends.cudnn.allow_tf32 = True
log = logging.getLogger()                      # root logger, used by the audio path
device = 'cuda'
dtype = torch.bfloat16
# MMAudio config: 'large_44k_v2' checkpoint; downloaded on first run if missing.
audio_model_config: ModelConfig = all_model_cfg['large_44k_v2']
audio_model_config.download_if_needed()
setup_eval_logging()
|
|
| |
# ---------------- Load the NAG video pipeline (once, at import time) ----------------
try:
    # VAE is loaded in fp32; the transformer and pipeline run in bf16.
    vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
    wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
    transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
    pipe = NAGWanPipeline.from_pretrained(
        MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
    )
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
    pipe.to("cuda")

    # NOTE(review): patches the transformer CLASS (not the instance) —
    # presumably to force NAG's attention processors/forward onto whatever
    # module the pipeline instantiated; confirm against NagWanTransformer3DModel.
    pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
    pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
    pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
    print("¡Modelo de video NAG cargado exitosamente!")
except Exception as e:
    # Degrade gracefully: the UI still launches; generation returns None.
    print(f"Error cargando modelo de video NAG: {e}")
    pipe = None
|
|
| |
def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Instantiate the MMAudio network and its feature utilities.

    Both modules are moved to the global ``device``/``dtype`` and put in eval
    mode. Returns ``(network, feature_utils, sequence_config)``.
    """
    config = audio_model_config

    network: MMAudio = get_my_mmaudio(config.model_name).to(device, dtype).eval()
    state = torch.load(config.model_path, map_location=device, weights_only=True)
    network.load_weights(state)
    log.info(f'Cargados pesos de MMAudio desde {audio_model_config.model_path}')

    utils = FeaturesUtils(
        tod_vae_ckpt=config.vae_path,
        synchformer_ckpt=config.synchformer_ckpt,
        enable_conditions=True,
        mode=config.mode,
        bigvgan_vocoder_ckpt=config.bigvgan_16k_path,
        need_vae_encoder=False,
    ).to(device, dtype).eval()

    return network, utils, config.seq_cfg
|
|
# Pre-load the audio models once at startup so request handlers can reuse them.
try:
    audio_net, audio_feature_utils, audio_seq_cfg = get_mmaudio_model()
    print("MMAudio Model loaded successfully!")
except Exception as e:
    print(f"Error loading MMAudio Model: {e}")
    # Bug fix: the original only set `audio_net = None`, leaving
    # `audio_feature_utils` and `audio_seq_cfg` undefined on failure —
    # any later reference would raise NameError. Define all three.
    audio_net = None
    audio_feature_utils = None
    audio_seq_cfg = None
|
|
| |
@torch.inference_mode()
def add_audio_to_video(video_path, prompt, audio_negative_prompt, audio_steps, audio_cfg_strength, duration):
    """Generate an audio track for the video and mux it in.

    Args:
        video_path: path of the silent mp4 produced by the video pipeline.
        prompt: text prompt (shared with video generation) driving the audio.
        audio_negative_prompt: elements to avoid in the audio.
        audio_steps / audio_cfg_strength: MMAudio sampling parameters.
        duration: target duration in seconds.

    Returns:
        Path of the video with audio, or the original ``video_path`` on any failure.
    """
    try:
        # Bug fix: the original called get_mmaudio_model() here, reloading the
        # full MMAudio stack from disk on EVERY request even though it was
        # already pre-loaded at startup. Reuse the cached globals when present.
        if audio_net is not None:
            net, feature_utils, seq_cfg = audio_net, audio_feature_utils, audio_seq_cfg
        else:
            net, feature_utils, seq_cfg = get_mmaudio_model()

        audio_output = mmaudio_generate(
            net, feature_utils, seq_cfg,
            prompt, audio_negative_prompt,
            audio_steps, audio_cfg_strength, duration
        )

        # Mux the generated audio with the input video.
        final_video_path = make_video(video_path, audio_output, duration)

        return final_video_path
    except Exception as e:
        # Best-effort: audio failure must never lose the generated video.
        log.error(f"Error generando audio: {e}")
        return video_path
|
|
| |
def get_duration(prompt, nag_negative_prompt, nag_scale, height, width, duration_seconds,
                 steps, seed, randomize_seed, enable_audio, audio_negative_prompt,
                 audio_steps, audio_cfg_strength):
    """Estimate the GPU seconds to reserve for @spaces.GPU.

    30 s baseline for video generation, plus 20 s when audio is enabled.
    The signature mirrors generate_video_with_audio so Spaces can forward
    the same arguments; only ``enable_audio`` affects the result.
    """
    audio_extra = 20 if enable_audio else 0
    return 30 + audio_extra
|
|
@spaces.GPU(duration=get_duration)
def generate_video_with_audio(
    prompt,
    nag_negative_prompt, nag_scale,
    height=DEFAULT_H_SLIDER_VALUE, width=DEFAULT_W_SLIDER_VALUE, duration_seconds=DEFAULT_DURATION_SECONDS,
    steps=DEFAULT_STEPS,
    seed=DEFAULT_SEED, randomize_seed=False,
    enable_audio=True, audio_negative_prompt=DEFAULT_AUDIO_NEGATIVE_PROMPT,
    audio_steps=25, audio_cfg_strength=4.5,
):
    """Generate a video from *prompt* with the NAG pipeline and optional audio.

    Returns:
        (video_path | None, seed_used): path of the final mp4 (with audio when
        enabled and audio generation succeeded), or None on failure, plus the
        seed actually used for generation.
    """
    if pipe is None:
        # Video model failed to load at startup — nothing to do.
        return None, DEFAULT_SEED

    # Bug fix: resolve the seed BEFORE the try block. The original assigned
    # `current_seed` inside `try` (after the resolution math) but referenced
    # it in the `except` handler, so any early exception raised NameError
    # instead of returning (None, seed) as intended.
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

    try:
        # Snap the requested resolution down to the model's required multiple.
        target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
        target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

        # frames = seconds * fps + 1, clamped to the model's supported range.
        num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)

        print(f"Generando video con: prompt='{prompt}', resolución={target_w}x{target_h}, frames={num_frames}")

        with torch.inference_mode():
            nag_output_frames_list = pipe(
                prompt=prompt,
                nag_negative_prompt=nag_negative_prompt,
                nag_scale=nag_scale,
                nag_tau=3.5,
                nag_alpha=0.5,
                height=target_h, width=target_w, num_frames=num_frames,
                guidance_scale=0.,  # CFG disabled; NAG provides the guidance
                num_inference_steps=int(steps),
                generator=torch.Generator(device="cuda").manual_seed(current_seed)
            ).frames[0]

        # delete=False: the file must outlive this handler so Gradio can serve it.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
            temp_video_path = tmpfile.name
            export_to_video(nag_output_frames_list, temp_video_path, fps=FIXED_FPS)
        print(f"Video guardado en: {temp_video_path}")

        if enable_audio:
            try:
                print("Añadiendo audio al video...")
                final_video_path = add_audio_to_video(
                    temp_video_path,
                    prompt,
                    audio_negative_prompt,
                    audio_steps,
                    audio_cfg_strength,
                    duration_seconds
                )
                # Drop the silent intermediate once the muxed version exists.
                if os.path.exists(temp_video_path) and final_video_path != temp_video_path:
                    os.remove(temp_video_path)
                print(f"Video final con audio: {final_video_path}")
            except Exception as e:
                # Best-effort audio: fall back to the silent video on failure.
                log.error(f"Falló la generación de audio: {e}")
                final_video_path = temp_video_path
        else:
            final_video_path = temp_video_path

        return final_video_path, current_seed
    except Exception as e:
        print(f"Error en generación de video: {e}")
        return None, current_seed
|
|
| |
def set_example(prompt, nag_negative_prompt, nag_scale):
    """Fill the UI controls with an example's values without starting generation.

    Returns a tuple matching the order of the generation inputs: the three
    example-provided fields followed by the default video/audio settings.
    """
    fixed_defaults = (
        DEFAULT_H_SLIDER_VALUE,
        DEFAULT_W_SLIDER_VALUE,
        DEFAULT_DURATION_SECONDS,
        DEFAULT_STEPS,
        DEFAULT_SEED,
        True,   # randomize seed
        True,   # enable audio
        DEFAULT_AUDIO_NEGATIVE_PROMPT,
        25,     # audio steps
        4.5,    # audio cfg strength
    )
    return (prompt, nag_negative_prompt, nag_scale) + fixed_defaults
|
|
| |
# Example prompts (Spanish, runtime strings — do not translate).
# Each row maps onto [prompt, nag_negative_prompt, nag_scale] for gr.Examples.
examples = [
    ["Autopista de medianoche fuera de una ciudad iluminada con neón. Un Porsche 911 Carrera RS negro de 1973 acelera a 120 km/h. Dentro, un cantante-guitarrista elegante canta mientras conduce, guitarra vintage sunburst en el asiento del pasajero. Las luces de sodio de la calle se deslizan sobre el capó; paneles RGB cambian de magenta a azul en el conductor. Cámara: inmersión de dron, toma baja de rueda con brazo ruso, gimbal interior, barrel roll FPV, espiral aérea. Paleta neo-noir, reflejos de asfalto mojado por lluvia, rugido del motor flat-six mezclado con guitarra en vivo.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
    ["Concierto de rock en arena lleno con 20,000 fanáticos. Un guitarrista principal extravagante con chaqueta de cuero y aviators espejados hace shred en una Flying V cereza-roja en un escenario elevado. Llamas de pirotecnia se disparan en cada downbeat, chorros de CO₂ estallan detrás. Luces móviles giran en turquesa y ámbar, follow-spots iluminan el pelo del guitarrista. Steadicam órbita 360°, toma de grúa elevándose sobre la multitud, ultra cámara lenta del ataque de púa a 1,000 fps. Grado de película turquesa-naranja, rugido ensordecedor de la multitud mezclado con solo de guitarra chillón.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
    ["Camino rural de hora dorada serpenteando a través de campos de trigo ondulantes. Un hombre y una mujer montan una motocicleta café-racer vintage, pelo y bufanda ondeando en la brisa cálida. Toma de persecución con dron revela campos agrícolas infinitos; slider bajo a lo largo de la rueda trasera captura estela de polvo. Luz de sol retroilumina a los jinetes, bloom de lente en los reflejos. Underscore de rock acústico suave; rugido del motor mezclado a -8 dB. Grado de color cálido pastel, grano de película suave para ambiente nostálgico.", DEFAULT_NAG_NEGATIVE_PROMPT, 11],
]
|
|
| |
# Custom CSS injected into gr.Blocks below (runtime string — content unchanged).
css = """
/* Columna derecha - salida de video */
.video-output {
    min-height: 600px;
    border: 2px dashed #e5e7eb;
    border-radius: 12px;
    display: flex;
    align-items: center;
    justify-content: center;
    background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
}

/* Botón de generación */
.generate-btn {
    background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 16px 32px !important;
    border-radius: 12px !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 15px rgba(139, 92, 246, 0.3) !important;
}

.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 25px rgba(139, 92, 246, 0.4) !important;
}

/* Configuración de audio */
.audio-settings {
    background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
    border-radius: 12px;
    padding: 20px;
    margin: 20px 0;
    border: 1px solid #f59e0b;
}

/* Configuración de video */
.video-settings {
    background: linear-gradient(135deg, #dbeafe 0%, #bfdbfe 100%);
    border-radius: 12px;
    padding: 20px;
    margin: 20px 0;
    border: 1px solid #3b82f6;
}

/* Títulos de sección */
.section-title {
    color: #1f2937;
    font-weight: 700;
    margin-bottom: 16px;
    font-size: 1.25rem;
}

/* Tooltips y información */
.info-text {
    color: #6b7280;
    font-size: 0.875rem;
    margin-top: 4px;
}

/* Ejemplos */
.examples-section {
    background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%);
    border-radius: 12px;
    padding: 20px;
    margin-top: 20px;
}
"""
|
|
| |
# ---------------- Gradio UI ----------------
with gr.Blocks(css=css, title="🎬 VEO3 Free - Generador de Video con IA", theme=gr.themes.Soft()) as demo:
    # Header / feature overview (Spanish UI text — runtime content, unchanged).
    gr.Markdown("""
    # 🎬 VEO3 Free - Generador de Video con IA
    
    ### Genera videos profesionales con audio automático usando inteligencia artificial
    
    **Características principales:**
    - 🎥 Generación de video de alta calidad con modelo Wan2.1-T2V-14B
    - 🔊 Audio automático sincronizado con el contenido visual
    - ⚡ Generación rápida de 4 pasos con tecnología NAG
    - 🎨 Resoluciones personalizables de 128x128 a 896x896
    - 🎯 Duración ajustable de 1 a 8 segundos
    
    ---
    """)
    
    with gr.Row():
        # Left column: prompt + all generation settings.
        with gr.Column(scale=4):
            gr.Markdown("### 📝 Descripción del Video (también se usa para generación de audio)")
            # The same prompt drives both video and audio generation.
            prompt = gr.Textbox(
                label="Describe tu escena de video en detalle...",
                placeholder="Ej: Un carro deportivo rojo acelerando por una autopista desierta al atardecer, cámara en movimiento desde un dron...",
                lines=4,
                max_lines=8,
                interactive=True
            )
            
            # Video settings group (styled by .video-settings CSS class).
            with gr.Group(elem_classes="video-settings"):
                gr.Markdown("### 🎥 Configuración Avanzada de Video")
                
                with gr.Row():
                    nag_negative_prompt = gr.Textbox(
                        label="Prompt Negativo del Video",
                        value=DEFAULT_NAG_NEGATIVE_PROMPT,
                        placeholder="Elementos a evitar en el video (ej: estático, borroso, mala calidad)",
                        lines=2
                    )
                
                with gr.Row():
                    nag_scale = gr.Slider(
                        minimum=1.0,
                        maximum=20.0,
                        step=0.5,
                        value=11.0,
                        label="🎛️ Escala NAG",
                        info="Mayor escala = mayor adherencia al prompt"
                    )
                
                with gr.Row():
                    duration_seconds_input = gr.Slider(
                        minimum=1,
                        maximum=8,
                        step=1,
                        value=DEFAULT_DURATION_SECONDS,
                        label="⏱️ Duración (segundos)",
                        info="Duración del video generado"
                    )
                    steps_slider = gr.Slider(
                        minimum=1,
                        maximum=8,
                        step=1,
                        value=DEFAULT_STEPS,
                        label="🔄 Pasos de Inferencia",
                        info="Más pasos = mejor calidad, pero más lento"
                    )
                
                # Resolution sliders step by 32 to match MOD_VALUE snapping.
                with gr.Row():
                    height_input = gr.Slider(
                        minimum=SLIDER_MIN_H,
                        maximum=SLIDER_MAX_H,
                        step=32,
                        value=DEFAULT_H_SLIDER_VALUE,
                        label="📏 Altura (x32)",
                        info="Altura del video en píxeles"
                    )
                    width_input = gr.Slider(
                        minimum=SLIDER_MIN_W,
                        maximum=SLIDER_MAX_W,
                        step=32,
                        value=DEFAULT_W_SLIDER_VALUE,
                        label="📐 Ancho (x32)",
                        info="Ancho del video en píxeles"
                    )
                
                with gr.Row():
                    seed_input = gr.Number(
                        label="🎲 Semilla",
                        value=DEFAULT_SEED,
                        interactive=True
                    )
                    randomize_seed_checkbox = gr.Checkbox(
                        label="🎲 Semilla Aleatoria",
                        value=True,
                        interactive=True
                    )
            
            # Audio settings group (styled by .audio-settings CSS class).
            with gr.Group(elem_classes="audio-settings"):
                gr.Markdown("### 🎵 Configuración de Generación de Audio")
                
                enable_audio = gr.Checkbox(
                    label="🔊 Habilitar Generación Automática de Audio",
                    value=True,
                    interactive=True
                )
                
                # Sub-settings shown only while audio generation is enabled.
                with gr.Column(visible=True) as audio_settings_group:
                    audio_negative_prompt = gr.Textbox(
                        label="Prompt Negativo del Audio",
                        value=DEFAULT_AUDIO_NEGATIVE_PROMPT,
                        placeholder="Elementos a evitar en el audio (ej: música, habla)",
                    )
                    
                    with gr.Row():
                        audio_steps = gr.Slider(
                            minimum=10,
                            maximum=50,
                            step=5,
                            value=25,
                            label="🎚️ Pasos de Audio",
                            info="Más pasos = mejor calidad"
                        )
                        audio_cfg_strength = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            step=0.5,
                            value=4.5,
                            label="🎛️ Guía de Audio",
                            info="Fuerza de la guía del prompt"
                        )
                
                # Toggle visibility of the audio sub-settings with the checkbox.
                enable_audio.change(
                    fn=lambda x: gr.update(visible=x),
                    inputs=[enable_audio],
                    outputs=[audio_settings_group]
                )
            
            generate_button = gr.Button(
                "🎬 Generar Video con Audio",
                variant="primary",
                elem_classes="generate-btn"
            )
        
        # Right column: generated video output.
        with gr.Column(scale=5):
            video_output = gr.Video(
                label="Video Generado con Audio",
                autoplay=True,
                interactive=False,
                elem_classes="video-output",
                height=600
            )
            
            gr.HTML("""
            <div style="text-align: center; margin-top: 20px; color: #6b7280;">
                <p>💡 Consejo: ¡El mismo prompt se usa para la generación de video y audio!</p>
                <p>🎧 El audio se combina automáticamente con el contenido visual</p>
            </div>
            """)
    
    with gr.Row():
        gr.Markdown("### 🎯 Prompts de Ejemplo")
    
    # Clicking an example only fills the three prompt/scale inputs;
    # cache_examples=False avoids running generation at build time.
    gr.Examples(
        examples=examples,
        inputs=[prompt, nag_negative_prompt, nag_scale],
        outputs=None,
        cache_examples=False
    )
    
    # Input order MUST match generate_video_with_audio's signature
    # (and get_duration's, which @spaces.GPU calls with the same arguments).
    ui_inputs = [
        prompt,
        nag_negative_prompt, nag_scale,
        height_input, width_input, duration_seconds_input,
        steps_slider,
        seed_input, randomize_seed_checkbox,
        enable_audio, audio_negative_prompt, audio_steps, audio_cfg_strength,
    ]
    
    # seed_input is also an output so the UI shows the seed actually used.
    generate_button.click(
        fn=generate_video_with_audio,
        inputs=ui_inputs,
        outputs=[video_output, seed_input],
    )


if __name__ == "__main__":
    # queue() serializes GPU requests; launch() starts the Gradio server.
    demo.queue().launch()