| import gradio as gr |
| import torch |
| import torchaudio |
| import whisper |
| import cv2 |
| import numpy as np |
| from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip |
| from transformers import pipeline, AutoTokenizer, AutoModel |
| import tempfile |
| import os |
| import json |
| from datetime import timedelta |
| import librosa |
| from scipy.signal import find_peaks |
| import tensorflow as tf |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.metrics.pairwise import cosine_similarity |
| import spacy |
| import nltk |
| from googletrans import Translator |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
class ZenVisionModel:
    """
    ZenVision - Advanced AI Subtitle Generation Model.

    End-to-end pipeline: audio extraction -> Whisper transcription
    (with per-segment sentiment/emotion/NER enrichment) -> optional
    translation -> subtitle wrapping -> burned-in subtitle video plus
    SRT/VTT/JSON export.

    Constructing this class eagerly loads Whisper large-v2, three
    HuggingFace pipelines and multilingual BERT — several GB of weights
    are downloaded/loaded in ``__init__``.
    """

    def __init__(self):
        # Prefer the GPU when available; load_models() derives the HF
        # pipeline device index (0 / -1) from this string.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Inicializando ZenVision en {self.device}")
        self.load_models()

    def load_models(self):
        """Load every AI model used by the pipeline (blocking, slow)."""
        print("📦 Cargando modelos de IA...")

        # Speech-to-text with word-level timestamps.
        self.whisper_model = whisper.load_model("large-v2")

        # HuggingFace pipelines expect device=0 for CUDA, -1 for CPU.
        hf_device = 0 if self.device == "cuda" else -1

        # English -> multilingual MT model (loaded as an attribute; the
        # translation step below actually goes through googletrans).
        self.translator = pipeline("translation",
                                   model="Helsinki-NLP/opus-mt-en-mul",
                                   device=hf_device)

        self.sentiment_analyzer = pipeline("sentiment-analysis",
                                           model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                                           device=hf_device)

        self.emotion_detector = pipeline("text-classification",
                                         model="j-hartmann/emotion-english-distilroberta-base",
                                         device=hf_device)

        # Multilingual BERT embeddings (kept as attributes; not used by
        # the current pipeline methods).
        self.bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")

        self.google_translator = Translator()

        # spaCy NER is optional: spacy.load raises OSError when the
        # model package is not installed, in which case NER is skipped.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("⚠️ Modelo spacy no encontrado, usando funcionalidad básica")
            self.nlp = None

        print("✅ Todos los modelos cargados exitosamente")

    def extract_audio_features(self, video_path):
        """Extract the audio track plus spectral features.

        Returns a dict with the 16 kHz mono waveform, sample rate,
        MFCCs, spectral centroids, chroma, non-silent intervals and
        total duration in seconds.
        """
        print("🎵 Extrayendo características de audio...")

        video = VideoFileClip(video_path)
        # NamedTemporaryFile replaces the deprecated, race-prone
        # tempfile.mktemp; delete=False because moviepy reopens the path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio_path = tmp.name
        try:
            video.audio.write_audiofile(audio_path, verbose=False, logger=None)

            # 16 kHz mono is the sample rate Whisper expects downstream.
            y, sr = librosa.load(audio_path, sr=16000)

            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)

            # Non-silent regions (threshold 20 dB below peak).
            intervals = librosa.effects.split(y, top_db=20)
        finally:
            # Always release the clip and delete the temp wav, even when
            # extraction/analysis fails part-way.
            video.close()
            if os.path.exists(audio_path):
                os.remove(audio_path)

        return {
            'audio_data': y,
            'sample_rate': sr,
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'chroma': chroma,
            'intervals': intervals,
            'duration': len(y) / sr
        }

    def advanced_transcription(self, audio_features):
        """Transcribe with Whisper and enrich each segment with
        sentiment, emotion and (when spaCy is available) named entities.
        """
        print("🎤 Realizando transcripción avanzada...")

        # Bug fix: Whisper auto-detects the language when language=None;
        # the string "auto" is not a valid language code and makes
        # transcribe() raise a KeyError.
        result = self.whisper_model.transcribe(
            audio_features['audio_data'],
            language=None,
            word_timestamps=True,
            verbose=False
        )

        segments = []
        for segment in result['segments']:
            sentiment = self.sentiment_analyzer(segment['text'])[0]
            emotion = self.emotion_detector(segment['text'])[0]

            # Named entities, skipped when the spaCy model is missing.
            entities = []
            if self.nlp:
                doc = self.nlp(segment['text'])
                entities = [(ent.text, ent.label_) for ent in doc.ents]

            segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'],
                'confidence': segment.get('avg_logprob', 0),
                'sentiment': sentiment,
                'emotion': emotion,
                'entities': entities,
                'words': segment.get('words', [])
            })

        return {
            'language': result['language'],
            'segments': segments,
            'full_text': result['text']
        }

    def intelligent_translation(self, transcription, target_language):
        """Translate every segment, preserving named-entity casing.

        Falls back to the original text when the network-backed Google
        translation fails (deliberate best-effort behavior).
        """
        print(f"🌍 Traduciendo a {target_language}...")

        translated_segments = []

        for segment in transcription['segments']:
            original_text = segment['text']

            # googletrans calls a web service; keep best-effort semantics
            # but narrow the bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                google_translation = self.google_translator.translate(
                    original_text,
                    dest=target_language
                ).text
            except Exception:
                google_translation = original_text

            # Restore original casing for person/org/place names if the
            # translation lowercased them (replace is a no-op otherwise).
            final_translation = google_translation
            if segment['entities']:
                for entity_text, entity_type in segment['entities']:
                    if entity_type in ['PERSON', 'ORG', 'GPE']:
                        final_translation = final_translation.replace(
                            entity_text.lower(), entity_text
                        )

            translated_segments.append({
                **segment,
                'translated_text': final_translation,
                'original_text': original_text
            })

        return translated_segments

    def _wrap_subtitle_text(self, text, max_chars=42, max_lines=2):
        """Greedy word-wrap into at most max_lines lines of at most
        max_chars characters; words beyond the line budget are dropped.
        """
        lines = []
        current_line = ""
        for word in text.split():
            if len(current_line + " " + word) <= max_chars:
                current_line += (" " + word) if current_line else word
            else:
                if current_line:
                    lines.append(current_line)
                current_line = word
            # Stop consuming words once the line budget is spent.
            if len(lines) >= max_lines:
                break
        if current_line:
            lines.append(current_line)
        return "\n".join(lines[:max_lines])

    def generate_smart_subtitles(self, segments, video_duration):
        """Build display-ready subtitle cues (wrapped text + emotion
        color) from enriched transcription/translation segments.
        """
        print("📝 Generando subtítulos inteligentes...")

        subtitles = []
        for segment in segments:
            # Prefer the translated text when the translation step ran.
            text = segment.get('translated_text', segment['text'])
            subtitle_text = self._wrap_subtitle_text(text)

            emotion_label = segment['emotion']['label']
            subtitles.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': subtitle_text,
                'emotion': emotion_label,
                'color': self.get_emotion_color(emotion_label),
                'confidence': segment['confidence']
            })

        return subtitles

    def get_emotion_color(self, emotion):
        """Map an emotion label (case-insensitive) to a subtitle color;
        unknown labels fall back to white.
        """
        emotion_colors = {
            'joy': 'yellow',
            'sadness': 'blue',
            'anger': 'red',
            'fear': 'purple',
            'surprise': 'orange',
            'disgust': 'green',
            'neutral': 'white'
        }
        return emotion_colors.get(emotion.lower(), 'white')

    def create_subtitle_video(self, video_path, subtitles, output_path):
        """Burn the subtitle cues into the video and write output_path."""
        print("🎬 Creando video con subtítulos...")

        video = VideoFileClip(video_path)
        subtitle_clips = []

        for subtitle in subtitles:
            # One positioned, timed TextClip per subtitle cue.
            txt_clip = TextClip(
                subtitle['text'],
                fontsize=24,
                font='Arial-Bold',
                color=subtitle['color'],
                stroke_color='black',
                stroke_width=2
            ).set_position(('center', 'bottom')).set_duration(
                subtitle['end'] - subtitle['start']
            ).set_start(subtitle['start'])

            subtitle_clips.append(txt_clip)

        final_video = CompositeVideoClip([video] + subtitle_clips)
        try:
            final_video.write_videofile(
                output_path,
                codec='libx264',
                audio_codec='aac',
                verbose=False,
                logger=None
            )
        finally:
            # Release both clips even when encoding fails.
            video.close()
            final_video.close()

        return output_path

    def export_subtitle_formats(self, subtitles, base_path):
        """Write the subtitles next to base_path as .srt, .vtt and .json
        and return a {format_name: path} mapping.
        """
        formats = {}

        # SubRip (SRT): index, "start --> end", text, blank separator.
        srt_path = f"{base_path}.srt"
        with open(srt_path, 'w', encoding='utf-8') as f:
            for i, sub in enumerate(subtitles, 1):
                start_time = self.seconds_to_srt_time(sub['start'])
                end_time = self.seconds_to_srt_time(sub['end'])
                f.write(f"{i}\n{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['srt'] = srt_path

        # WebVTT: header plus unindexed cues ('.' millisecond separator).
        vtt_path = f"{base_path}.vtt"
        with open(vtt_path, 'w', encoding='utf-8') as f:
            f.write("WEBVTT\n\n")
            for sub in subtitles:
                start_time = self.seconds_to_vtt_time(sub['start'])
                end_time = self.seconds_to_vtt_time(sub['end'])
                f.write(f"{start_time} --> {end_time}\n{sub['text']}\n\n")
        formats['vtt'] = vtt_path

        # Raw JSON dump keeping all metadata (emotion, confidence, ...).
        json_path = f"{base_path}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(subtitles, f, indent=2, ensure_ascii=False)
        formats['json'] = json_path

        return formats

    def _format_timestamp(self, seconds, ms_separator):
        """Format seconds as HH:MM:SS<sep>mmm (',' for SRT, '.' for VTT)."""
        hours, remainder = divmod(float(seconds), 3600)
        minutes, secs = divmod(remainder, 60)
        milliseconds = int((secs % 1) * 1000)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}{ms_separator}{milliseconds:03d}"

    def seconds_to_srt_time(self, seconds):
        """Convert seconds to the SRT timestamp format (comma separator)."""
        return self._format_timestamp(seconds, ",")

    def seconds_to_vtt_time(self, seconds):
        """Convert seconds to the WebVTT timestamp format (dot separator)."""
        return self._format_timestamp(seconds, ".")

    def process_video(self, video_file, target_language="es", include_emotions=True):
        """Run the full pipeline on an uploaded video.

        Returns (subtitled_video_path, srt_path, status_message); the
        first two are None when input is missing or an error occurs.
        NOTE: include_emotions is accepted for UI compatibility but is
        currently unused — emotion analysis always runs.
        """
        if video_file is None:
            return None, None, "Por favor sube un video"

        try:
            print("🎯 Iniciando procesamiento con ZenVision...")

            audio_features = self.extract_audio_features(video_file.name)
            transcription = self.advanced_transcription(audio_features)

            # Translate only when the detected language differs from the
            # requested one; otherwise keep the enriched segments as-is.
            if target_language != transcription['language']:
                segments = self.intelligent_translation(transcription, target_language)
            else:
                segments = transcription['segments']

            subtitles = self.generate_smart_subtitles(segments, audio_features['duration'])

            # Secure temp paths (tempfile.mktemp is deprecated and racy).
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                output_video_path = tmp.name
            self.create_subtitle_video(video_file.name, subtitles, output_video_path)

            subtitle_base_path = os.path.join(tempfile.mkdtemp(), "zenvision")
            subtitle_formats = self.export_subtitle_formats(subtitles, subtitle_base_path)

            # Guard against an empty segment list before averaging.
            confidences = [s['confidence'] for s in segments]
            stats = {
                'language_detected': transcription['language'],
                'total_segments': len(subtitles),
                'duration': audio_features['duration'],
                'avg_confidence': float(np.mean(confidences)) if confidences else 0.0,
                'emotions_detected': len(set(s['emotion']['label'] for s in segments))
            }

            status_msg = f"""✅ Procesamiento completado con ZenVision!

📊 Estadísticas:
• Idioma detectado: {stats['language_detected']}
• Segmentos generados: {stats['total_segments']}
• Duración: {stats['duration']:.1f}s
• Confianza promedio: {stats['avg_confidence']:.2f}
• Emociones detectadas: {stats['emotions_detected']}

🎯 Tecnologías utilizadas:
• Whisper Large-v2 (Transcripción)
• BERT Multilingual (Embeddings)
• RoBERTa (Análisis de sentimientos)
• DistilRoBERTa (Detección de emociones)
• Google Translate (Traducción)
• OpenCV + MoviePy (Procesamiento de video)
• Librosa (Análisis de audio)
• spaCy (NLP avanzado)
"""

            return output_video_path, subtitle_formats['srt'], status_msg

        except Exception as e:
            # Top-level boundary: report the failure to the UI rather
            # than crashing the Gradio worker.
            return None, None, f"❌ Error en ZenVision: {str(e)}"
|
|
| |
# Instantiate the model once at import time so the Gradio UI below can
# bind its click handler to it. This eagerly loads several GB of model
# weights, so importing this module is slow by design.
print("🚀 Inicializando ZenVision Model...")
zenvision = ZenVisionModel()
|
|
| |
# Declarative Gradio UI: two columns (input controls / results), a status
# textbox, a static feature grid, and one click handler wiring the button
# to ZenVisionModel.process_video. Statement order defines the layout.
with gr.Blocks(title="ZenVision - AI Subtitle Generator", theme=gr.themes.Soft()) as demo:
    # Static header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1>🎬 ZenVision AI Subtitle Generator</h1>
        <p style="font-size: 18px; color: #666;">
            Modelo avanzado de subtitulado automático con IA<br>
            <strong>Desarrollado por el equipo ZenVision</strong>
        </p>
        <p style="font-size: 14px; color: #888;">
            Modelo de 3GB+ • Whisper • BERT • RoBERTa • OpenCV • Librosa • spaCy
        </p>
    </div>
    """)

    with gr.Row():
        # Left column: upload widget + processing options.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Entrada")
            video_input = gr.Video(label="Subir Video", height=300)

            with gr.Row():
                # (label, value) pairs: the value is the language code
                # passed as target_language to process_video.
                language_dropdown = gr.Dropdown(
                    choices=[
                        ("Español", "es"),
                        ("English", "en"),
                        ("Français", "fr"),
                        ("Deutsch", "de"),
                        ("Italiano", "it"),
                        ("Português", "pt"),
                        ("中文", "zh"),
                        ("日本語", "ja"),
                        ("한국어", "ko"),
                        ("Русский", "ru")
                    ],
                    value="es",
                    label="Idioma de destino"
                )

            # Forwarded as include_emotions (currently informational).
            emotions_checkbox = gr.Checkbox(
                label="Incluir análisis de emociones",
                value=True
            )

            process_btn = gr.Button(
                "🚀 Procesar con ZenVision",
                variant="primary",
                size="lg"
            )

        # Right column: rendered video + downloadable SRT file.
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Resultados")
            video_output = gr.Video(label="Video con Subtítulos", height=300)
            subtitle_file = gr.File(label="Archivo de Subtítulos (.srt)")

    # Status/stats message returned by process_video.
    with gr.Row():
        status_output = gr.Textbox(
            label="Estado del Procesamiento",
            lines=15,
            interactive=False
        )

    # Static marketing/feature grid (no interactivity).
    gr.Markdown("### 🎯 Características de ZenVision")
    gr.HTML("""
    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 20px 0;">
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>🎤 Transcripción Avanzada</h4>
            <p>Whisper Large-v2 con timestamps precisos y detección automática de idioma</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>🌍 Traducción Inteligente</h4>
            <p>Google Translate + preservación de entidades nombradas</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>😊 Análisis Emocional</h4>
            <p>Detección de emociones y sentimientos con colores adaptativos</p>
        </div>
        <div style="padding: 15px; border: 1px solid #ddd; border-radius: 8px;">
            <h4>📝 Múltiples Formatos</h4>
            <p>Exportación en SRT, VTT y JSON con metadatos completos</p>
        </div>
    </div>
    """)

    # Wire the button: (video, language code, checkbox) ->
    # (subtitled video, SRT file, status text).
    process_btn.click(
        fn=zenvision.process_video,
        inputs=[video_input, language_dropdown, emotions_checkbox],
        outputs=[video_output, subtitle_file, status_output]
    )
|
|
# Launch the app only when run as a script (not when imported).
# server_name="0.0.0.0" binds on all interfaces and share=True also opens
# a public Gradio tunnel — NOTE(review): confirm public exposure is
# intended for this deployment.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )