# ==============================
# SECTION 1 — INSTALL + IMPORTS
# ==============================
import torch
import gradio as gr
from PIL import Image
from transformers import pipeline, BlipProcessor, BlipForQuestionAnswering
import lpips
import clip
from bert_score import score
import torchvision.transforms as T
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

device = "cuda" if torch.cuda.is_available() else "cpu"


def free_gpu_cache():
    """Release cached GPU memory; a no-op on CPU-only machines."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# ==============================
# SECTION 2 — LOAD LIGHTWEIGHT MODELS
# ==============================
blip_large_captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=0 if device == "cuda" else -1,
)

vit_gpt2_captioner = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=0 if device == "cuda" else -1,
)

# --- NLP Pipelines ---
sentiment_model = pipeline("sentiment-analysis")
ner_model = pipeline("ner", aggregation_strategy="simple")
topic_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# --- Metrics ---
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
lpips_model = lpips.LPIPS(net='alex').to(device)
# LPIPS wants inputs in [-1, 1]; the *2-1 rescale happens at call time.
lpips_transform = T.Compose([T.ToTensor(), T.Resize((128, 128))])
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # for cosine similarity

# ==============================
# SECTION 2b — LAZY LOAD HEAVY MODELS
# ==============================
blip2_captioner = None
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = None


def get_blip2():
    """Lazily construct and cache the (heavy) BLIP2-OPT captioning pipeline."""
    global blip2_captioner
    if blip2_captioner is None:
        blip2_captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip2-opt-2.7b",
            device=0 if device == "cuda" else -1,
        )
    return blip2_captioner


def get_vqa_model():
    """Lazily construct and cache the BLIP VQA model on `device`."""
    global vqa_model
    if vqa_model is None:
        vqa_model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base"
        ).to(device)
    return vqa_model


# ==============================
# SECTION 3 — FUNCTIONS
# ==============================
def make_captions(img):
    """Caption `img` with all three models.

    Returns a list of three strings in fixed order
    [BLIP-large, ViT-GPT2, BLIP2-OPT]. A failing model contributes a
    placeholder message instead of raising (best-effort by design).
    """
    captions = []
    try:
        captions.append(blip_large_captioner(img)[0]["generated_text"])
    except Exception:  # was a bare except; narrowed, behavior kept
        captions.append("BLIP-large failed.")
    try:
        captions.append(vit_gpt2_captioner(img)[0]["generated_text"])
    except Exception:
        captions.append("ViT-GPT2 failed.")
    try:
        blip2 = get_blip2()
        captions.append(blip2(img)[0]["generated_text"])
    except Exception:
        captions.append("BLIP2-opt failed.")
    return captions


# ---------------- Metrics Computation ---------------------
def compute_metrics_button(images, captions, idx1, idx2):
    """Compare the (image, caption) pair at `idx1` vs `idx2`.

    Computes image metrics (CLIP cosine similarity, LPIPS distance) and
    caption metrics (BERTScore F1, sentence-embedding cosine, Jaccard
    token overlap, ROUGE-1/L) and returns them as a Markdown bullet list.
    """
    # CLIP similarity between image features
    img1_clip = clip_preprocess(images[idx1]).unsqueeze(0).to(device)
    img2_clip = clip_preprocess(images[idx2]).unsqueeze(0).to(device)
    with torch.no_grad():
        feat1 = clip_model.encode_image(img1_clip)
        feat2 = clip_model.encode_image(img2_clip)
        clip_sim = float(torch.cosine_similarity(feat1, feat2).item())

    # LPIPS perceptual distance (rescale [0,1] tensors to [-1,1])
    img1_lp = lpips_transform(images[idx1]).unsqueeze(0).to(device) * 2 - 1
    img2_lp = lpips_transform(images[idx2]).unsqueeze(0).to(device) * 2 - 1
    with torch.no_grad():
        lpips_score = float(lpips_model(img1_lp, img2_lp).item())

    # BERTScore between captions
    _, _, F1 = score([captions[idx1]], [captions[idx2]], lang="en", verbose=False)
    bert_f1 = float(F1.mean().item())

    # Cosine similarity of sentence embeddings
    emb1 = sentence_model.encode([captions[idx1]])
    emb2 = sentence_model.encode([captions[idx2]])
    cosine_sim = float(cosine_similarity(emb1, emb2)[0][0])

    # Jaccard token overlap (guard: two empty captions would divide by zero)
    tokens1 = set(captions[idx1].lower().split())
    tokens2 = set(captions[idx2].lower().split())
    union = tokens1 | tokens2
    jaccard_sim = float(len(tokens1 & tokens2) / len(union)) if union else 0.0

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(captions[idx1], captions[idx2])

    # One bullet per line so gr.Markdown renders an actual list.
    return f"""
- CLIP: {clip_sim:.4f}
- LPIPS: {lpips_score:.4f}
- BERT-F1: {bert_f1:.4f}
- Cosine: {cosine_sim:.4f}
- Jaccard: {jaccard_sim:.4f}
- ROUGE-1: {rouge_scores['rouge1'].fmeasure:.4f}
- ROUGE-L: {rouge_scores['rougeL'].fmeasure:.4f}
"""


# ---- NLP ----
def nlp_bundle(caption):
    """Run sentiment, NER and zero-shot topic classification on `caption`.

    Returns (sentiment, entities, topics) as newline-joined strings.
    Each stage degrades to an error message rather than raising.
    """
    try:
        sentiment = sentiment_model(caption)
        sentiment = "\n".join(f"{s['label']}: {s['score']:.2f}" for s in sentiment)
    except Exception:
        sentiment = "Sentiment failed."
    try:
        ents_list = ner_model(caption)
        # join of an empty entity list is "" (falsy) -> show "None"
        ents = "\n".join(f"{e['entity_group']}: {e['word']}" for e in ents_list) or "None"
    except Exception:
        ents = "NER failed."
    try:
        topics_raw = topic_model(
            caption,
            candidate_labels=["people", "animals", "objects", "food", "nature"],
        )
        topics = "\n".join(
            f"{lbl}: {float(scr):.2f}"
            for lbl, scr in zip(topics_raw["labels"], topics_raw["scores"])
        )
    except Exception:
        topics = "Topics failed."
    return sentiment, ents, topics


# ---------------- VQA ----------------
def answer_vqa(question, image):
    """Answer `question` about `image` with the lazily-loaded BLIP VQA model."""
    if image is None or question.strip() == "":
        return "Upload an image and enter a question."
    model = get_vqa_model()
    inputs = vqa_processor(images=image, text=question, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
    answer = vqa_processor.decode(generated_ids[0], skip_special_tokens=True)
    free_gpu_cache()
    return answer


# Convert a PIL.Image to PNG byte stream
def to_bytes(img):
    import io
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()


# ==============================
# SECTION 4 — UI (GRADIO)
# ==============================
def build_ui():
    """Assemble and return the Gradio Blocks app."""
    with gr.Blocks(title="Multimodal AI Image Studio") as demo:
        # NOTE(review): custom CSS appears to have been stripped from this
        # HTML block (the elem_classes below reference styles defined here).
        gr.HTML(""" """)
        gr.Markdown(
            "## Multimodal AI Image Studio: Comparative Image-to-Text Analysis",
            elem_classes="heading-orange",
        )

        images_state = gr.State([])
        captions_state = gr.State([])

        # ---------------- Image Input ----------------
        gr.Markdown("### Select Image Source", elem_classes="heading-orange")
        with gr.Tabs():
            with gr.Tab("📁 Upload Image"):
                upload_input = gr.Image(
                    type="pil", sources=["upload"], label="Upload Image",
                    height=900, width=960, elem_classes="circular-img",
                )
                upload_btn = gr.Button("Generate Captions", elem_classes="orange-btn")
            with gr.Tab("📷 Webcam"):
                webcam_input = gr.Image(
                    type="pil", sources=["webcam"], label="Webcam",
                    height=900, width=960, elem_classes="circular-img",
                )
                webcam_btn = gr.Button("Capture & Generate Captions", elem_classes="orange-btn")
            with gr.Tab("🔗 From URL"):
                url_input = gr.Textbox(label="Paste Image URL")
                url_btn = gr.Button("Fetch & Generate Captions", elem_classes="orange-btn")

        # ---------------- Previews ----------------
        with gr.Row():
            with gr.Column(scale=1, min_width=200):
                preview1 = gr.Image(type="pil", label="Preview 1", interactive=False, height=230)
                blip_caption_box = gr.Markdown()
            with gr.Column(scale=1, min_width=200):
                preview2 = gr.Image(type="pil", label="Preview 2", interactive=False, height=230)
                vit_caption_box = gr.Markdown()
            with gr.Column(scale=1, min_width=200):
                preview3 = gr.Image(type="pil", label="Preview 3", interactive=False, height=230)
                blip2_caption_box = gr.Markdown()

        # ---------------- Generate Captions ----------------
        def generate_all(img, images_state, captions_state):
            """Caption `img` and fan the result out to previews + state."""
            # images_state/captions_state are bound by Gradio but not read here.
            if img is None:
                return (None, None, None, "No image.", "No image.", "No image.", [], [])
            captions = make_captions(img)
            return (img, img, img, captions[0], captions[1], captions[2], [img], captions)

        # All three caption triggers write to the same outputs.
        caption_outputs = [
            preview1, preview2, preview3,
            blip_caption_box, vit_caption_box, blip2_caption_box,
            images_state, captions_state,
        ]
        upload_btn.click(
            generate_all,
            inputs=[upload_input, images_state, captions_state],
            outputs=caption_outputs,
        )
        webcam_btn.click(
            generate_all,
            inputs=[webcam_input, images_state, captions_state],
            outputs=caption_outputs,
        )

        def load_from_url(url, images_state, captions_state):
            """Fetch an image from `url`, then delegate to generate_all."""
            import requests
            from io import BytesIO
            try:
                # timeout added: without it a dead URL hangs the UI worker
                img = Image.open(BytesIO(requests.get(url, timeout=15).content))
            except Exception:
                return (None, None, None, "Bad URL.", "Bad URL.", "Bad URL.", [], [])
            return generate_all(img, images_state, captions_state)

        url_btn.click(
            load_from_url,
            inputs=[url_input, images_state, captions_state],
            outputs=caption_outputs,
        )

        # ---------------- Metrics ----------------
        gr.Markdown("### Compute Pairwise Metrics", elem_classes="heading-orange")
        metrics_btn = gr.Button("Compute Metrics for All Pairs", elem_classes="teal-btn")
        with gr.Row(elem_classes="metrics-row"):
            metrics_A = gr.Markdown()
            metrics_B = gr.Markdown()
            metrics_C = gr.Markdown()

        def compute_metrics_all_pairs_ui(images, captions):
            """Generator handler: clear panes, then yield all three pair reports."""
            # Interim yield blanks the three panes while computing.
            # NOTE(review): original spinner markup seems to have been
            # stripped; only "\n" placeholders remain — confirm and restore.
            yield ("\n", "\n", "\n")
            if len(images) < 1 or len(captions) < 3:
                msg = "Upload 1 image and generate all 3 captions."
                yield (msg, msg, msg)
                return
            # Only one image is stored; replicate it so indices 0..2 resolve.
            imgs = images * 3
            A = compute_metrics_button(imgs, captions, 0, 1)
            B = compute_metrics_button(imgs, captions, 0, 2)
            C = compute_metrics_button(imgs, captions, 1, 2)
            yield (
                f"### BLIP-large ↔ ViT-GPT2\n{A}",
                f"### BLIP-large ↔ BLIP2\n{B}",
                f"### ViT-GPT2 ↔ BLIP2\n{C}",
            )

        metrics_btn.click(
            compute_metrics_all_pairs_ui,
            inputs=[images_state, captions_state],
            outputs=[metrics_A, metrics_B, metrics_C],
        )

        # ---------------- NLP ----------------
        gr.Markdown("### NLP Analysis", elem_classes="heading-orange")
        nlp_btn = gr.Button("Analyze Captions", elem_classes="teal-btn")
        with gr.Row(elem_classes="metrics-row"):  # reuse metrics-row for flex layout
            nlp_A = gr.Markdown()
            nlp_B = gr.Markdown()
            nlp_C = gr.Markdown()

        def do_nlp_all(captions):
            """Generator handler: clear panes, then yield one NLP report per model."""
            # Same clear-then-fill pattern as the metrics handler.
            yield ("\n", "\n", "\n")
            if len(captions) < 3:
                msg = "All 3 captions required."
                yield (msg, msg, msg)
                return
            labels = ["BLIP-large", "ViT-GPT2", "BLIP2"]
            results = []
            for label, cap in zip(labels, captions):
                s, e, t = nlp_bundle(cap)
                results.append(
                    f"\n\n{label}\n\nSentiment\n{s}\n\nEntities\n{e}\n\nTopics\n{t} "
                )
            yield (results[0], results[1], results[2])

        nlp_btn.click(do_nlp_all, inputs=[captions_state], outputs=[nlp_A, nlp_B, nlp_C])

        # (An older single-pane NLP handler that had been parked in a
        # triple-quoted string was removed as dead code.)

        # ---------------- VQA ----------------
        gr.Markdown("### Visual Question Answering (VQA)", elem_classes="heading-orange")
        with gr.Row():
            vqa_input = gr.Textbox(label="Ask about the image")
            vqa_btn = gr.Button("Get Answer", elem_classes="teal-btn")
        vqa_out = gr.Markdown()

        def vqa_ui(question, image):
            """Generator handler: clear the answer pane, then yield the answer."""
            yield "\n"
            yield answer_vqa(question, image)

        vqa_btn.click(vqa_ui, inputs=[vqa_input, preview1], outputs=[vqa_out])

    return demo


# ==============================
# LAUNCH
# ==============================
demo = build_ui()
demo.launch(share=True, debug=False)